diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index cd5f2c3012712..cb08e2107f072 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -311,7 +311,7 @@ let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in { def lddqu : X86Builtin<"_Vector<16, char>(char const *)">; } -let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">; } @@ -605,8 +605,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; - def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, " - "_Vector<32, char>, _Constant int)">; + def psadbw256 : X86Builtin< "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; @@ -630,6 +629,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">; def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; + def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">; def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; @@ -3263,7 +3263,7 @@ let Features = "avx512bw", Attributes = [NoThrow, Const] in { def kmovq : X86Builtin<"unsigned long long 
int(unsigned long long int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 0ef130c0a55df..6c7b2f502cc51 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2841,76 +2841,6 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC, - const CallExpr *Call) { - assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); - const Pointer &Control = S.Stk.pop(); - const Pointer &Src = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - - unsigned NumElems = Dst.getNumElems(); - assert(NumElems == Control.getNumElems()); - assert(NumElems == Dst.getNumElems()); - - for (unsigned Idx = 0; Idx != NumElems; ++Idx) { - uint8_t Ctlb = static_cast(Control.elem(Idx)); - - if (Ctlb & 0x80) { - Dst.elem(Idx) = 0; - } else { - unsigned LaneBase = (Idx / 16) * 16; - unsigned SrcOffset = Ctlb & 0x0F; - unsigned SrcIdx = LaneBase + SrcOffset; - - Dst.elem(Idx) = Src.elem(SrcIdx); - } - } - Dst.initializeAllElements(); - return true; -} - -static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, - const CallExpr *Call, bool IsShufHW) { - assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); - APSInt ControlImm = popToAPSInt(S, Call->getArg(1)); - const Pointer &Src = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - - unsigned NumElems = Dst.getNumElems(); - PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - - unsigned ElemBits = static_cast(primSize(ElemT) * 8); - if (ElemBits != 16 && ElemBits != 32) - return false; - - unsigned LaneElts = 128u / 
ElemBits; - assert(LaneElts && (NumElems % LaneElts == 0)); - - uint8_t Ctl = static_cast(ControlImm.getZExtValue()); - - for (unsigned Idx = 0; Idx != NumElems; Idx++) { - unsigned LaneBase = (Idx / LaneElts) * LaneElts; - unsigned LaneIdx = Idx % LaneElts; - unsigned SrcIdx = Idx; - unsigned Sel = (Ctl >> (2 * (LaneIdx & 0x3))) & 0x3; - if (ElemBits == 32) { - SrcIdx = LaneBase + Sel; - } else { - constexpr unsigned HalfSize = 4; - bool InHigh = LaneIdx >= HalfSize; - if (!IsShufHW && !InHigh) { - SrcIdx = LaneBase + Sel; - } else if (IsShufHW && InHigh) { - SrcIdx = LaneBase + HalfSize + Sel; - } - } - - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(Idx) = Src.elem(SrcIdx); }); - } - Dst.initializeAllElements(); - return true; -} - static bool interp__builtin_ia32_test_op( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref Fn) { @@ -3377,61 +3307,46 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_x86_byteshift( - InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID, - llvm::function_ref - Fn) { - assert(Call->getNumArgs() == 2); - - APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); - uint64_t Shift = ImmAPS.getZExtValue() & 0xff; - - const Pointer &Src = S.Stk.pop(); - if (!Src.getFieldDesc()->isPrimitiveArray()) - return false; - - unsigned NumElems = Src.getNumElems(); - const Pointer &Dst = S.Stk.peek(); - PrimType ElemT = Src.getFieldDesc()->getPrimType(); - - for (unsigned Lane = 0; Lane != NumElems; Lane += 16) { - for (unsigned I = 0; I != 16; ++I) { - unsigned Base = Lane + I; - APSInt Result = APSInt(Fn(Src, Lane, I, Shift)); - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(Base) = static_cast(Result); }); - } - } - - Dst.initializeAllElements(); - - return true; -} - static bool interp__builtin_ia32_shuffle_generic( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref(unsigned, unsigned)> GetSourceIndex) { - assert(Call->getNumArgs() == 3); 
+ assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3); unsigned ShuffleMask = 0; Pointer A, MaskVector, B; - - QualType Arg2Type = Call->getArg(2)->getType(); bool IsVectorMask = false; - if (Arg2Type->isVectorType()) { - IsVectorMask = true; - B = S.Stk.pop(); - MaskVector = S.Stk.pop(); - A = S.Stk.pop(); - } else if (Arg2Type->isIntegerType()) { - ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); - B = S.Stk.pop(); - A = S.Stk.pop(); + bool IsSingleOperand = (Call->getNumArgs() == 2); + + if (IsSingleOperand) { + QualType MaskType = Call->getArg(1)->getType(); + if (MaskType->isVectorType()) { + IsVectorMask = true; + MaskVector = S.Stk.pop(); + A = S.Stk.pop(); + B = A; + } else if (MaskType->isIntegerType()) { + ShuffleMask = popToAPSInt(S, Call->getArg(1)).getZExtValue(); + A = S.Stk.pop(); + B = A; + } else { + return false; + } } else { - return false; + QualType Arg2Type = Call->getArg(2)->getType(); + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + B = S.Stk.pop(); + MaskVector = S.Stk.pop(); + A = S.Stk.pop(); + } else if (Arg2Type->isIntegerType()) { + ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + B = S.Stk.pop(); + A = S.Stk.pop(); + } else { + return false; + } } QualType Arg0Type = Call->getArg(0)->getType(); @@ -3455,6 +3370,7 @@ static bool interp__builtin_ia32_shuffle_generic( ShuffleMask = static_cast(MaskVector.elem(DstIdx)); }); } + auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); if (SrcIdx < 0) { @@ -4555,22 +4471,58 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: - return interp__builtin_ia32_pshufb(S, OpPC, Call); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + uint8_t Ctlb = static_cast(ShuffleMask); + if (Ctlb & 0x80) + return std::make_pair(0, -1); + + unsigned LaneBase = 
(DstIdx / 16) * 16; + unsigned SrcOffset = Ctlb & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + return std::make_pair(0, static_cast(SrcIdx)); + }); case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, false); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned LaneBase = (DstIdx / 8) * 8; + unsigned LaneIdx = DstIdx % 8; + if (LaneIdx < 4) { + unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3; + return std::make_pair(0, static_cast(LaneBase + Sel)); + } + + return std::make_pair(0, static_cast(DstIdx)); + }); case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, true); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned LaneBase = (DstIdx / 8) * 8; + unsigned LaneIdx = DstIdx % 8; + if (LaneIdx >= 4) { + unsigned Sel = (ShuffleMask >> (2 * (LaneIdx - 4))) & 0x3; + return std::make_pair(0, static_cast(LaneBase + 4 + Sel)); + } + + return std::make_pair(0, static_cast(DstIdx)); + }); case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, false); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned LaneBase = (DstIdx / 4) * 4; + unsigned LaneIdx = DstIdx % 4; + unsigned Sel = (ShuffleMask >> (2 * LaneIdx)) & 0x3; + return std::make_pair(0, static_cast(LaneBase + Sel)); + }); case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: @@ -4728,13 +4680,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, // The lane width is hardcoded to 16 to match the SIMD register size, // but the algorithm processes 
one byte per iteration, // so APInt(8, ...) is correct and intentional. - return interp__builtin_x86_byteshift( - S, OpPC, Call, BuiltinID, - [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) { - if (I < Shift) { - return APInt(8, 0); - } - return APInt(8, Src.elem(Lane + I - Shift)); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, + [](unsigned DstIdx, unsigned Shift) -> std::pair { + unsigned LaneBase = (DstIdx / 16) * 16; + unsigned LaneIdx = DstIdx % 16; + if (LaneIdx < Shift) + return std::make_pair(0, -1); + + return std::make_pair(0, + static_cast(LaneBase + LaneIdx - Shift)); }); case X86::BI__builtin_ia32_psrldqi128_byteshift: @@ -4744,14 +4699,40 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, // The lane width is hardcoded to 16 to match the SIMD register size, // but the algorithm processes one byte per iteration, // so APInt(8, ...) is correct and intentional. - return interp__builtin_x86_byteshift( - S, OpPC, Call, BuiltinID, - [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) { - if (I + Shift < 16) { - return APInt(8, Src.elem(Lane + I + Shift)); + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, + [](unsigned DstIdx, unsigned Shift) -> std::pair { + unsigned LaneBase = (DstIdx / 16) * 16; + unsigned LaneIdx = DstIdx % 16; + if (LaneIdx + Shift < 16) + return std::make_pair(0, + static_cast(LaneBase + LaneIdx + Shift)); + + return std::make_pair(0, -1); + }); + + case X86::BI__builtin_ia32_palignr128: + case X86::BI__builtin_ia32_palignr256: + case X86::BI__builtin_ia32_palignr512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned Shift) { + // Default to -1 → zero-fill this destination element + unsigned VecIdx = 1; + int ElemIdx = -1; + + int Lane = DstIdx / 16; + int Offset = DstIdx % 16; + + // Elements come from VecB first, then VecA after the shift boundary + unsigned ShiftedIdx = Offset + (Shift & 0xFF); + if 
(ShiftedIdx < 16) { // from VecB + ElemIdx = ShiftedIdx + (Lane * 16); + } else if (ShiftedIdx < 32) { // from VecA + VecIdx = 0; + ElemIdx = (ShiftedIdx - 16) + (Lane * 16); } - return APInt(8, 0); + return std::pair{VecIdx, ElemIdx}; }); default: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 972d9fe3b5e4f..1bfea24b228e8 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12090,24 +12090,46 @@ static bool evalShuffleGeneric( unsigned ShuffleMask = 0; APValue A, MaskVector, B; bool IsVectorMask = false; - - QualType Arg2Type = Call->getArg(2)->getType(); - if (Arg2Type->isVectorType()) { - IsVectorMask = true; - if (!EvaluateAsRValue(Info, Call->getArg(0), A) || - !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) || - !EvaluateAsRValue(Info, Call->getArg(2), B)) - return false; - } else if (Arg2Type->isIntegerType()) { - APSInt MaskImm; - if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) - return false; - ShuffleMask = static_cast(MaskImm.getZExtValue()); - if (!EvaluateAsRValue(Info, Call->getArg(0), A) || - !EvaluateAsRValue(Info, Call->getArg(1), B)) + bool IsSingleOperand = (Call->getNumArgs() == 2); + + if (IsSingleOperand) { + QualType MaskType = Call->getArg(1)->getType(); + if (MaskType->isVectorType()) { + IsVectorMask = true; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), MaskVector)) + return false; + B = A; + } else if (MaskType->isIntegerType()) { + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(1), MaskImm, Info)) + return false; + ShuffleMask = static_cast(MaskImm.getZExtValue()); + if (!EvaluateAsRValue(Info, Call->getArg(0), A)) + return false; + B = A; + } else { return false; + } } else { - return false; + QualType Arg2Type = Call->getArg(2)->getType(); + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) || 
+ !EvaluateAsRValue(Info, Call->getArg(2), B)) + return false; + } else if (Arg2Type->isIntegerType()) { + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) + return false; + ShuffleMask = static_cast(MaskImm.getZExtValue()); + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), B)) + return false; + } else { + return false; + } } unsigned NumElts = VT->getNumElements(); @@ -12124,8 +12146,16 @@ static bool evalShuffleGeneric( if (SrcIdx < 0) { // Zero out this element QualType ElemTy = VT->getElementType(); - ResultElements.push_back( - APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy)))); + if (ElemTy->isRealFloatingType()) { + ResultElements.push_back( + APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy)))); + } else if (ElemTy->isIntegerType()) { + APValue Zero(Info.Ctx.MakeIntValue(0, ElemTy)); + ResultElements.push_back(APValue(Zero)); + } else { + // Other types of fallback logic + ResultElements.push_back(APValue()); + } } else { const APValue &Src = (SrcVecIdx == 0) ? 
A : B; ResultElements.push_back(Src.getVectorElt(SrcIdx)); @@ -12136,98 +12166,6 @@ static bool evalShuffleGeneric( return true; } -static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, - APValue &Out) { - APValue SrcVec, ControlVec; - if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec)) - return false; - if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec)) - return false; - - const auto *VT = Call->getType()->getAs(); - if (!VT) - return false; - - QualType ElemT = VT->getElementType(); - unsigned NumElts = VT->getNumElements(); - - SmallVector ResultElements; - ResultElements.reserve(NumElts); - - for (unsigned Idx = 0; Idx != NumElts; ++Idx) { - APValue CtlVal = ControlVec.getVectorElt(Idx); - APSInt CtlByte = CtlVal.getInt(); - uint8_t Ctl = static_cast(CtlByte.getZExtValue()); - - if (Ctl & 0x80) { - APValue Zero(Info.Ctx.MakeIntValue(0, ElemT)); - ResultElements.push_back(Zero); - } else { - unsigned LaneBase = (Idx / 16) * 16; - unsigned SrcOffset = Ctl & 0x0F; - unsigned SrcIdx = LaneBase + SrcOffset; - - ResultElements.push_back(SrcVec.getVectorElt(SrcIdx)); - } - } - Out = APValue(ResultElements.data(), ResultElements.size()); - return true; -} - -static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, - bool IsShufHW, APValue &Out) { - APValue Vec; - APSInt Imm; - if (!EvaluateAsRValue(Info, Call->getArg(0), Vec)) - return false; - if (!EvaluateInteger(Call->getArg(1), Imm, Info)) - return false; - - const auto *VT = Call->getType()->getAs(); - if (!VT) - return false; - - QualType ElemT = VT->getElementType(); - unsigned ElemBits = Info.Ctx.getTypeSize(ElemT); - unsigned NumElts = VT->getNumElements(); - - unsigned LaneBits = 128u; - unsigned LaneElts = LaneBits / ElemBits; - if (!LaneElts || (NumElts % LaneElts) != 0) - return false; - - uint8_t Ctl = static_cast(Imm.getZExtValue()); - - SmallVector ResultElements; - ResultElements.reserve(NumElts); - - for (unsigned Idx = 0; Idx != NumElts; Idx++) { - unsigned LaneBase = 
(Idx / LaneElts) * LaneElts; - unsigned LaneIdx = Idx % LaneElts; - unsigned SrcIdx = Idx; - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; - - if (ElemBits == 32) { - SrcIdx = LaneBase + Sel; - } else { - constexpr unsigned HalfSize = 4; - bool InHigh = LaneIdx >= HalfSize; - if (!IsShufHW && !InHigh) { - SrcIdx = LaneBase + Sel; - } else if (IsShufHW && InHigh) { - unsigned Rel = LaneIdx - HalfSize; - Sel = (Ctl >> (2 * Rel)) & 0x3; - SrcIdx = LaneBase + HalfSize + Sel; - } - } - - ResultElements.push_back(Vec.getVectorElt(SrcIdx)); - } - - Out = APValue(ResultElements.data(), ResultElements.size()); - return true; -} - bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!IsConstantEvaluatedBuiltinCall(E)) return ExprEvaluatorBaseTy::VisitCallExpr(E); @@ -12993,7 +12931,19 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: { APValue R; - if (!evalPshufbBuiltin(Info, E, R)) + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, + unsigned ShuffleMask) -> std::pair { + uint8_t Ctlb = static_cast(ShuffleMask); + if (Ctlb & 0x80) + return std::make_pair(0, -1); + + unsigned LaneBase = (DstIdx / 16) * 16; + unsigned SrcOffset = Ctlb & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + return std::make_pair(0, static_cast(SrcIdx)); + })) return false; return Success(R, E); } @@ -13002,7 +12952,21 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { APValue R; - if (!evalPshufBuiltin(Info, E, false, R)) + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair { + constexpr unsigned LaneBits = 128u; + constexpr unsigned ElemBits = 16u; + constexpr unsigned LaneElts = LaneBits / ElemBits; + constexpr unsigned HalfSize = 4; + unsigned LaneBase = (DstIdx / LaneElts) * LaneElts; + unsigned LaneIdx = DstIdx % LaneElts; + if (LaneIdx < HalfSize) { + 
unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3; + return std::make_pair(0, static_cast(LaneBase + Sel)); + } + return std::make_pair(0, static_cast(DstIdx)); + })) return false; return Success(R, E); } @@ -13011,7 +12975,23 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { APValue R; - if (!evalPshufBuiltin(Info, E, true, R)) + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair { + constexpr unsigned LaneBits = 128u; + constexpr unsigned ElemBits = 16u; + constexpr unsigned LaneElts = LaneBits / ElemBits; + constexpr unsigned HalfSize = 4; + unsigned LaneBase = (DstIdx / LaneElts) * LaneElts; + unsigned LaneIdx = DstIdx % LaneElts; + if (LaneIdx >= HalfSize) { + unsigned Rel = LaneIdx - HalfSize; + unsigned Sel = (Mask >> (2 * Rel)) & 0x3; + return std::make_pair( + 0, static_cast(LaneBase + HalfSize + Sel)); + } + return std::make_pair(0, static_cast(DstIdx)); + })) return false; return Success(R, E); } @@ -13020,7 +13000,17 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: { APValue R; - if (!evalPshufBuiltin(Info, E, false, R)) + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Mask) -> std::pair { + constexpr unsigned LaneBits = 128u; + constexpr unsigned ElemBits = 32u; + constexpr unsigned LaneElts = LaneBits / ElemBits; + unsigned LaneBase = (DstIdx / LaneElts) * LaneElts; + unsigned LaneIdx = DstIdx % LaneElts; + unsigned Sel = (Mask >> (2 * LaneIdx)) & 0x3; + return std::make_pair(0, static_cast(LaneBase + Sel)); + })) return false; return Success(R, E); } @@ -13500,61 +13490,66 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pslldqi128_byteshift: case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { - assert(E->getNumArgs() == 2); 
- - APValue Src; - APSInt Imm; - if (!EvaluateAsRValue(Info, E->getArg(0), Src) || - !EvaluateInteger(E->getArg(1), Imm, Info)) + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Shift) -> std::pair { + unsigned LaneBase = (DstIdx / 16) * 16; + unsigned LaneIdx = DstIdx % 16; + if (LaneIdx < Shift) + return std::make_pair(0, -1); + + return std::make_pair( + 0, static_cast(LaneBase + LaneIdx - Shift)); + })) return false; - - unsigned VecLen = Src.getVectorLength(); - unsigned Shift = Imm.getZExtValue() & 0xff; - - SmallVector ResultElements; - for (unsigned Lane = 0; Lane != VecLen; Lane += 16) { - for (unsigned I = 0; I != 16; ++I) { - if (I < Shift) { - APSInt Zero(8, /*isUnsigned=*/true); - Zero = 0; - ResultElements.push_back(APValue(Zero)); - } else { - ResultElements.push_back(Src.getVectorElt(Lane + I - Shift)); - } - } - } - - return Success(APValue(ResultElements.data(), ResultElements.size()), E); + return Success(R, E); } case X86::BI__builtin_ia32_psrldqi128_byteshift: case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { - assert(E->getNumArgs() == 2); - - APValue Src; - APSInt Imm; - if (!EvaluateAsRValue(Info, E->getArg(0), Src) || - !EvaluateInteger(E->getArg(1), Imm, Info)) + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, unsigned Shift) -> std::pair { + unsigned LaneBase = (DstIdx / 16) * 16; + unsigned LaneIdx = DstIdx % 16; + if (LaneIdx + Shift < 16) + return std::make_pair( + 0, static_cast(LaneBase + LaneIdx + Shift)); + + return std::make_pair(0, -1); + })) return false; + return Success(R, E); + } - unsigned VecLen = Src.getVectorLength(); - unsigned Shift = Imm.getZExtValue() & 0xff; - - SmallVector ResultElements; - for (unsigned Lane = 0; Lane != VecLen; Lane += 16) { - for (unsigned I = 0; I != 16; ++I) { - if (I + Shift < 16) { - ResultElements.push_back(Src.getVectorElt(Lane + I + Shift)); - } else { - APSInt Zero(8, 
/*isUnsigned=*/true); - Zero = 0; - ResultElements.push_back(APValue(Zero)); - } - } - } + case X86::BI__builtin_ia32_palignr128: + case X86::BI__builtin_ia32_palignr256: + case X86::BI__builtin_ia32_palignr512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Shift) { + // Default to -1 → zero-fill this destination element + unsigned VecIdx = 1; + int ElemIdx = -1; + + int Lane = DstIdx / 16; + int Offset = DstIdx % 16; + + // Elements come from VecB first, then VecA after the shift boundary + unsigned ShiftedIdx = Offset + (Shift & 0xFF); + if (ShiftedIdx < 16) { // from VecB + ElemIdx = ShiftedIdx + (Lane * 16); + } else if (ShiftedIdx < 32) { // from VecA + VecIdx = 0; + ElemIdx = (ShiftedIdx - 16) + (Lane * 16); + } - return Success(APValue(ResultElements.data(), ResultElements.size()), E); + return std::pair{VecIdx, ElemIdx}; + })) + return false; + return Success(R, E); } case X86::BI__builtin_ia32_vpermi2varq128: case X86::BI__builtin_ia32_vpermi2varpd128: { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 5eba5ba6c3df1..c1a36134d8942 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -438,6 +438,10 @@ class ScalarExprEmitter : public StmtVisitor { return cgf.emitVAArg(ve); } + mlir::Value VisitCXXRewrittenBinaryOperator(CXXRewrittenBinaryOperator *e) { + return Visit(e->getSemanticForm()); + } + mlir::Value VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *e); mlir::Value VisitAbstractConditionalOperator(const AbstractConditionalOperator *e); diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index fbecab9774f5b..2ef09b74dc968 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -26,6 +26,11 @@ using namespace mlir; using namespace cir; +namespace mlir { 
+#define GEN_PASS_DEF_CIRCANONICALIZE +#include "clang/CIR/Dialect/Passes.h.inc" +} // namespace mlir + namespace { /// Removes branches between two blocks if it is the only branch. @@ -101,7 +106,8 @@ struct RemoveEmptySwitch : public OpRewritePattern { // CIRCanonicalizePass //===----------------------------------------------------------------------===// -struct CIRCanonicalizePass : public CIRCanonicalizeBase { +struct CIRCanonicalizePass + : public impl::CIRCanonicalizeBase { using CIRCanonicalizeBase::CIRCanonicalizeBase; // The same operation rewriting done here could have been performed diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp index 3c6f76892d5cb..dcef9ddee1bb4 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp @@ -21,6 +21,11 @@ using namespace mlir; using namespace cir; +namespace mlir { +#define GEN_PASS_DEF_CIRSIMPLIFY +#include "clang/CIR/Dialect/Passes.h.inc" +} // namespace mlir + //===----------------------------------------------------------------------===// // Rewrite patterns //===----------------------------------------------------------------------===// @@ -283,7 +288,7 @@ struct SimplifyVecSplat : public OpRewritePattern { // CIRSimplifyPass //===----------------------------------------------------------------------===// -struct CIRSimplifyPass : public CIRSimplifyBase { +struct CIRSimplifyPass : public impl::CIRSimplifyBase { using CIRSimplifyBase::CIRSimplifyBase; void runOnOperation() override; diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index ca7554e4e3754..69a5334ca2423 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -26,6 +26,11 @@ using namespace mlir; using namespace cir; +namespace mlir { +#define GEN_PASS_DEF_CIRFLATTENCFG +#include "clang/CIR/Dialect/Passes.h.inc" 
+} // namespace mlir + namespace { /// Lowers operations with the terminator trait that have a single successor. @@ -50,7 +55,7 @@ void walkRegionSkipping( }); } -struct CIRFlattenCFGPass : public CIRFlattenCFGBase { +struct CIRFlattenCFGPass : public impl::CIRFlattenCFGBase { CIRFlattenCFGPass() = default; void runOnOperation() override; diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp index c0db98440a902..00972b6976295 100644 --- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp +++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp @@ -14,9 +14,14 @@ using namespace mlir; using namespace cir; +namespace mlir { +#define GEN_PASS_DEF_GOTOSOLVER +#include "clang/CIR/Dialect/Passes.h.inc" +} // namespace mlir + namespace { -struct GotoSolverPass : public GotoSolverBase { +struct GotoSolverPass : public impl::GotoSolverBase { GotoSolverPass() = default; void runOnOperation() override; }; diff --git a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp index 72bbf08c79b16..74b22faadc8ae 100644 --- a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp +++ b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp @@ -20,9 +20,14 @@ using namespace mlir; using namespace cir; +namespace mlir { +#define GEN_PASS_DEF_HOISTALLOCAS +#include "clang/CIR/Dialect/Passes.h.inc" +} // namespace mlir + namespace { -struct HoistAllocasPass : public HoistAllocasBase { +struct HoistAllocasPass : public impl::HoistAllocasBase { HoistAllocasPass() = default; void runOnOperation() override; diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index cba04649ca05e..29b1211d2c351 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -23,6 +23,11 @@ using namespace mlir; using namespace cir; +namespace mlir { +#define GEN_PASS_DEF_LOWERINGPREPARE 
+#include "clang/CIR/Dialect/Passes.h.inc" +} // namespace mlir + static SmallString<128> getTransformedFileName(mlir::ModuleOp mlirModule) { SmallString<128> fileName; @@ -53,7 +58,8 @@ static cir::FuncOp getCalledFunction(cir::CallOp callOp) { } namespace { -struct LoweringPreparePass : public LoweringPrepareBase { +struct LoweringPreparePass + : public impl::LoweringPrepareBase { LoweringPreparePass() = default; void runOnOperation() override; diff --git a/clang/lib/CIR/Dialect/Transforms/PassDetail.h b/clang/lib/CIR/Dialect/Transforms/PassDetail.h index 600dde56d679f..ef42a85cc2751 100644 --- a/clang/lib/CIR/Dialect/Transforms/PassDetail.h +++ b/clang/lib/CIR/Dialect/Transforms/PassDetail.h @@ -21,7 +21,7 @@ namespace mlir { template void registerDialect(DialectRegistry ®istry); -#define GEN_PASS_CLASSES +#define GEN_PASS_DECL #include "clang/CIR/Dialect/Passes.h.inc" } // namespace mlir diff --git a/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp new file mode 100644 index 0000000000000..ac4cac429cb0f --- /dev/null +++ b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +struct HasOpEq { + bool operator==(const HasOpEq &) const; +}; + +void cxx_rewritten_binary_operator_scalar_expr() { + HasOpEq a; + HasOpEq b; + bool neq = a != b; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_HasOpEq, 
!cir.ptr, ["b"] +// CIR: %[[NEQ_ADDR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["neq", init] +// CIR: %[[EQ:.*]] = cir.call @_ZNK7HasOpEqeqERKS_(%[[A_ADDR]], %[[B_ADDR]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CIR: %[[NEQ:.*]] = cir.unary(not, %[[EQ]]) : !cir.bool, !cir.bool +// CIR: cir.store{{.*}} %[[NEQ]], %[[NEQ_ADDR]] : !cir.bool, !cir.ptr + +// LLVM: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1 +// LLVM: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1 +// LLVM: %[[NEQ_ADDR:.*]] = alloca i8, i64 1, align 1 +// LLVM: %[[EQ:.*]] = call i1 @_ZNK7HasOpEqeqERKS_(ptr %[[A_ADDR]], ptr %[[B_ADDR]]) +// LLVM: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true +// LLVM: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8 +// LLVM: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1 + +// OGCG: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, align 1 +// OGCG: %[[B_ADDR:.*]] = alloca %struct.HasOpEq, align 1 +// OGCG: %[[NEQ_ADDR:.*]] = alloca i8, align 1 +// OGCG: %[[EQ:.*]] = call {{.*}} zeroext i1 @_ZNK7HasOpEqeqERKS_(ptr {{.*}} %[[A_ADDR]], ptr {{.*}} %[[B_ADDR]]) +// OGCG: %[[NEQ_I1:.*]] = xor i1 %[[EQ]], true +// OGCG: %[[NEQ:.*]] = zext i1 %[[NEQ_I1]] to i8 +// OGCG: store i8 %[[NEQ]], ptr %[[NEQ_ADDR]], align 1 diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index de4cb2fd0b055..ce8e2f04e487c 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -109,6 +109,9 @@ __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) { // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> return _mm256_alignr_epi8(a, b, 2); } +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 39, 40, 41, 42, 43, 
44, 45, 46, 47, 48, 1, 2, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 17, 18)); +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)); +TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test2_mm256_alignr_epi8 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 0b73c7b14d869..2749dc5741b58 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -3057,6 +3057,9 @@ __m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){ // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> return _mm512_alignr_epi8(__A, __B, 2); } +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 
99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 1, 2, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 17, 18, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 33, 34, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127, 49, 50)); +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 16), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64)); +TEST_CONSTEXPR(match_v64qi(_mm512_alignr_epi8(((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){ // CHECK-LABEL: test_mm512_mask_alignr_epi8 @@ -3064,6 +3067,7 @@ __m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m5 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_alignr_epi8(((__m512i)(__v64qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127)); __m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){ // CHECK-LABEL: test_mm512_maskz_alignr_epi8 @@ -3071,6 +3075,7 @@ __m512i 
test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_alignr_epi8((__mmask64)0x000000000000000f, ((__m512i)(__v64qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), ((__m512i)(__v64qs){65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 127}), 2), 67, 68, 69, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 28e6afbc24564..7a5af2dc8742f 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -3538,6 +3538,7 @@ __m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_alignr_epi8(((__m128i)(__v16qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 2), 19, 20, 21, 22, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127)); __m128i test_mm_maskz_alignr_epi8(__mmask16 
__U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_alignr_epi8 @@ -3545,6 +3546,7 @@ __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_alignr_epi8(__U, __A, __B, 2); } +TEST_CONSTEXPR(match_v16qi( _mm_maskz_alignr_epi8((__mmask16)0x000f, ((__m128i)(__v16qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qs){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}),2), 19, 20, 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_alignr_epi8 @@ -3552,6 +3554,7 @@ __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_alignr_epi8(((__m256i)(__v32qs){127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}), (__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 63, 64, 17, 18)); __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_alignr_epi8 @@ -3559,6 +3562,7 @@ __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_alignr_epi8(__U, 
__A, __B, 2); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_alignr_epi8((__mmask32)0xf000000f, ((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 64, 17, 18)); __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dbsad_epu8 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 273138063a1b1..ad8a81c61ad43 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -102,6 +102,8 @@ __m64 test_mm_alignr_pi8(__m64 a, __m64 b) { // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> return _mm_alignr_pi8(a, b, 2); } +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 2), 11, 12, 13, 14, 15, 16, 1, 2)); +TEST_CONSTEXPR(match_v8qi(_mm_alignr_pi8(((__m64)(__v8qs){1, 2, 3, 4, 5, 6, 7, 8}), ((__m64)(__v8qs){9, 10, 11, 12, 13, 14, 15, 16}), 16), 0, 0, 0, 0, 0, 0, 0, 0)); __m64 test_mm_and_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_and_si64 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index b7a4a2fe7ccd7..193fa37f65d14 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -48,6 +48,8 @@ __m128i test_mm_alignr_epi8(__m128i a, __m128i b) { // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> return _mm_alignr_epi8(a, b, 2); } +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, 32}), 2), 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2)); +TEST_CONSTEXPR(match_v16qi(_mm_alignr_epi8(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m128i)(__v16qi){17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 32), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test2_mm_alignr_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test2_mm_alignr_epi8 diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 7f64d230f7348..4248e3a5461f5 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1110,6 +1110,9 @@ bool IsArraySection(const Expr &expr); // Predicate: does an expression contain constant? bool HasConstant(const Expr &); +// Predicate: Does an expression contain a component? +bool HasStructureComponent(const Expr &expr); + // Utilities for attaching the location of the declaration of a symbol // of interest to a message. Handles the case of USE association gracefully.
parser::Message *AttachDeclaration(parser::Message &, const Symbol &); diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 1c4d2daef2a11..353260b2e5c02 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -588,6 +588,8 @@ class ParseTreeDumper { NODE(parser, OmpExpectation) NODE_ENUM(OmpExpectation, Value) NODE(parser, OmpFailClause) + NODE(parser, OmpFallbackModifier) + NODE_ENUM(OmpFallbackModifier, Value) NODE(parser, OmpFromClause) NODE(OmpFromClause, Modifier) NODE(parser, OmpGrainsizeClause) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index cd9429b9631d6..2f6b95b2fa2a8 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4000,6 +4000,17 @@ struct OmpExpectation { WRAPPER_CLASS_BOILERPLATE(OmpExpectation, Value); }; +// Ref: [6.1:tbd] +// +// fallback-modifier -> +// FALLBACK(fallback-mode) // since 6.1 +// fallback-mode -> +// ABORT | DEFAULT_MEM | NULL // since 6.1 +struct OmpFallbackModifier { + ENUM_CLASS(Value, Abort, Default_Mem, Null); + WRAPPER_CLASS_BOILERPLATE(OmpFallbackModifier, Value); +}; + // REF: [5.1:217-220], [5.2:293-294] // // OmpInteropRuntimeIdentifier -> // since 5.2 @@ -4129,9 +4140,8 @@ struct OmpOrderModifier { // // prescriptiveness -> // STRICT // since 5.1 -// FALLBACK // since 6.1 struct OmpPrescriptiveness { - ENUM_CLASS(Value, Strict, Fallback) + ENUM_CLASS(Value, Strict) WRAPPER_CLASS_BOILERPLATE(OmpPrescriptiveness, Value); }; @@ -4512,7 +4522,7 @@ struct OmpDynamicAllocatorsClause { struct OmpDynGroupprivateClause { TUPLE_CLASS_BOILERPLATE(OmpDynGroupprivateClause); - MODIFIER_BOILERPLATE(OmpAccessGroup, OmpPrescriptiveness); + MODIFIER_BOILERPLATE(OmpAccessGroup, OmpFallbackModifier); std::tuple t; }; diff --git a/flang/include/flang/Semantics/openmp-modifiers.h 
b/flang/include/flang/Semantics/openmp-modifiers.h index bfa3aa4939cb1..283bf2a4c895e 100644 --- a/flang/include/flang/Semantics/openmp-modifiers.h +++ b/flang/include/flang/Semantics/openmp-modifiers.h @@ -67,6 +67,7 @@ template const OmpModifierDescriptor &OmpGetDescriptor(); #define DECLARE_DESCRIPTOR(name) \ template <> const OmpModifierDescriptor &OmpGetDescriptor() +DECLARE_DESCRIPTOR(parser::OmpAccessGroup); DECLARE_DESCRIPTOR(parser::OmpAlignment); DECLARE_DESCRIPTOR(parser::OmpAlignModifier); DECLARE_DESCRIPTOR(parser::OmpAllocatorComplexModifier); @@ -82,6 +83,7 @@ DECLARE_DESCRIPTOR(parser::OmpDependenceType); DECLARE_DESCRIPTOR(parser::OmpDeviceModifier); DECLARE_DESCRIPTOR(parser::OmpDirectiveNameModifier); DECLARE_DESCRIPTOR(parser::OmpExpectation); +DECLARE_DESCRIPTOR(parser::OmpFallbackModifier); DECLARE_DESCRIPTOR(parser::OmpInteropPreference); DECLARE_DESCRIPTOR(parser::OmpInteropType); DECLARE_DESCRIPTOR(parser::OmpIterator); diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index bd06acc21e47f..117b2249a9179 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1210,6 +1210,20 @@ bool HasConstant(const Expr &expr) { return HasConstantHelper{}(expr); } +// HasStructureComponent() +struct HasStructureComponentHelper + : public AnyTraverse { + using Base = AnyTraverse; + HasStructureComponentHelper() : Base(*this) {} + using Base::operator(); + + bool operator()(const Component &) const { return true; } +}; + +bool HasStructureComponent(const Expr &expr) { + return HasStructureComponentHelper{}(expr); +} + parser::Message *AttachDeclaration( parser::Message &message, const Symbol &symbol) { const Symbol *unhosted{&symbol}; diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index ce05ab22c880f..9cdd46137adbf 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -406,6 +406,11 @@ bool 
ClauseProcessor::processMergeable( return markClauseOccurrence(result.mergeable); } +bool ClauseProcessor::processNogroup( + mlir::omp::NogroupClauseOps &result) const { + return markClauseOccurrence(result.nogroup); +} + bool ClauseProcessor::processNowait(mlir::omp::NowaitClauseOps &result) const { return markClauseOccurrence(result.nowait); } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index fecf3ca4af9dd..00cf9bbe4c48b 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -89,6 +89,7 @@ class ClauseProcessor { bool processInclusive(mlir::Location currentLocation, mlir::omp::InclusiveClauseOps &result) const; bool processMergeable(mlir::omp::MergeableClauseOps &result) const; + bool processNogroup(mlir::omp::NogroupClauseOps &result) const; bool processNowait(mlir::omp::NowaitClauseOps &result) const; bool processNumTasks(lower::StatementContext &stmtCtx, mlir::omp::NumTasksClauseOps &result) const; diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 2575b7081039b..002b7c1888e73 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -396,8 +396,6 @@ makePrescriptiveness(parser::OmpPrescriptiveness::Value v) { switch (v) { case parser::OmpPrescriptiveness::Value::Strict: return clause::Prescriptiveness::Strict; - case parser::OmpPrescriptiveness::Value::Fallback: - return clause::Prescriptiveness::Fallback; } llvm_unreachable("Unexpected prescriptiveness"); } @@ -799,21 +797,31 @@ DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp, semantics::SemanticsContext &semaCtx) { // imp.v -> OmpDyngroupprivateClause CLAUSET_ENUM_CONVERT( // - convert, parser::OmpAccessGroup::Value, DynGroupprivate::AccessGroup, + makeAccessGroup, parser::OmpAccessGroup::Value, + DynGroupprivate::AccessGroup, // clang-format off MS(Cgroup, Cgroup) // clang-format on ); + CLAUSET_ENUM_CONVERT( // + 
makeFallback, parser::OmpFallbackModifier::Value, + DynGroupprivate::Fallback, + // clang-format off + MS(Abort, Abort) + MS(Default_Mem, Default_Mem) + MS(Null, Null) + // clang-format on + ); + auto &mods = semantics::OmpGetModifiers(inp.v); auto *m0 = semantics::OmpGetUniqueModifier(mods); - auto *m1 = semantics::OmpGetUniqueModifier(mods); + auto *m1 = semantics::OmpGetUniqueModifier(mods); auto &size = std::get(inp.v.t); - return DynGroupprivate{ - {/*AccessGroup=*/maybeApplyToV(convert, m0), - /*Prescriptiveness=*/maybeApplyToV(makePrescriptiveness, m1), - /*Size=*/makeExpr(size, semaCtx)}}; + return DynGroupprivate{{/*AccessGroup=*/maybeApplyToV(makeAccessGroup, m0), + /*Fallback=*/maybeApplyToV(makeFallback, m1), + /*Size=*/makeExpr(size, semaCtx)}}; } Enter make(const parser::OmpClause::Enter &inp, diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index b0dec20ca3a1f..472273e6e16e1 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -347,7 +347,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { if (!hasLastPrivate) return; - if (mlir::isa(op) || mlir::isa(op)) { + if (mlir::isa(op) || mlir::isa(op) || + mlir::isa(op)) { mlir::omp::LoopRelatedClauseOps result; llvm::SmallVector iv; collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval, @@ -413,7 +414,7 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { } else { TODO(converter.getCurrentLocation(), "lastprivate clause in constructs other than " - "simd/worksharing-loop"); + "simd/worksharing-loop/taskloop"); } } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index ea1b53fb5185f..d1366443c1647 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1770,25 +1770,27 @@ static void genTaskgroupClauses( cp.processTaskReduction(loc, clauseOps, 
taskReductionSyms); } -static void genTaskloopClauses(lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, - const List &clauses, mlir::Location loc, - mlir::omp::TaskloopOperands &clauseOps) { +static void genTaskloopClauses( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::StatementContext &stmtCtx, const List &clauses, + mlir::Location loc, mlir::omp::TaskloopOperands &clauseOps, + llvm::SmallVectorImpl &reductionSyms, + llvm::SmallVectorImpl &inReductionSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processFinal(stmtCtx, clauseOps); cp.processGrainsize(stmtCtx, clauseOps); cp.processIf(llvm::omp::Directive::OMPD_taskloop, clauseOps); + cp.processInReduction(loc, clauseOps, inReductionSyms); cp.processMergeable(clauseOps); + cp.processNogroup(clauseOps); cp.processNumTasks(stmtCtx, clauseOps); cp.processPriority(stmtCtx, clauseOps); + cp.processReduction(loc, clauseOps, reductionSyms); cp.processUntied(clauseOps); - cp.processTODO( - loc, llvm::omp::Directive::OMPD_taskloop); + cp.processTODO(loc, llvm::omp::Directive::OMPD_taskloop); } static void genTaskwaitClauses(lower::AbstractConverter &converter, @@ -2999,8 +3001,11 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::TaskloopOperands taskloopClauseOps; + llvm::SmallVector reductionSyms; + llvm::SmallVector inReductionSyms; + genTaskloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - taskloopClauseOps); + taskloopClauseOps, reductionSyms, inReductionSyms); DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, enableDelayedPrivatization, symTable); @@ -3015,6 +3020,10 @@ static mlir::omp::TaskloopOp genStandaloneTaskloop( EntryBlockArgs taskloopArgs; taskloopArgs.priv.syms 
= dsp.getDelayedPrivSymbols(); taskloopArgs.priv.vars = taskloopClauseOps.privateVars; + taskloopArgs.reduction.syms = reductionSyms; + taskloopArgs.reduction.vars = taskloopClauseOps.reductionVars; + taskloopArgs.inReduction.syms = inReductionSyms; + taskloopArgs.inReduction.vars = taskloopClauseOps.inReductionVars; auto taskLoopOp = genWrapperOp( converter, loc, taskloopClauseOps, taskloopArgs); diff --git a/flang/lib/Optimizer/CodeGen/PassDetail.h b/flang/lib/Optimizer/CodeGen/PassDetail.h index f7030131beff9..252da029dc0c8 100644 --- a/flang/lib/Optimizer/CodeGen/PassDetail.h +++ b/flang/lib/Optimizer/CodeGen/PassDetail.h @@ -18,7 +18,7 @@ namespace fir { -#define GEN_PASS_CLASSES +#define GEN_PASS_DECL #include "flang/Optimizer/CodeGen/CGPasses.h.inc" } // namespace fir diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 4374acbbe51bf..e2da60ed19de8 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -791,6 +791,12 @@ TYPE_PARSER(construct(OmpDirectiveNameParser{})) TYPE_PARSER(construct( // "PRESENT" >> pure(OmpExpectation::Value::Present))) +TYPE_PARSER(construct("FALLBACK"_tok >> + parenthesized( // + "ABORT" >> pure(OmpFallbackModifier::Value::Abort) || + "DEFAULT_MEM" >> pure(OmpFallbackModifier::Value::Default_Mem) || + "NULL" >> pure(OmpFallbackModifier::Value::Null)))) + TYPE_PARSER(construct( construct(charLiteralConstant) || construct(scalarIntConstantExpr))) @@ -857,8 +863,7 @@ TYPE_PARSER(construct( "SIMD" >> pure(OmpOrderingModifier::Value::Simd))) TYPE_PARSER(construct( - "STRICT" >> pure(OmpPrescriptiveness::Value::Strict) || - "FALLBACK" >> pure(OmpPrescriptiveness::Value::Fallback))) + "STRICT" >> pure(OmpPrescriptiveness::Value::Strict))) TYPE_PARSER(construct( // "PRESENT" >> pure(OmpPresentModifier::Value::Present))) @@ -925,7 +930,7 @@ TYPE_PARSER( // sourced(construct( Parser{})) || sourced(construct( - Parser{}))) + Parser{}))) TYPE_PARSER( 
sourced(construct(Parser{}))) diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index dc0f083c9fc95..53e74298f96ac 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2286,6 +2286,11 @@ class UnparseVisitor { Walk(std::get(x.t)); Walk(": ", std::get>>(x.t)); } + void Unparse(const OmpFallbackModifier &x) { + Word("FALLBACK("); + Walk(x.v); + Put(")"); + } void Unparse(const OmpDynGroupprivateClause &x) { using Modifier = OmpDynGroupprivateClause::Modifier; Walk(std::get>>(x.t), ": "); @@ -2795,6 +2800,7 @@ class UnparseVisitor { OmpDeviceTypeClause, DeviceTypeDescription) // OMP device_type WALK_NESTED_ENUM(OmpReductionModifier, Value) // OMP reduction-modifier WALK_NESTED_ENUM(OmpExpectation, Value) // OMP motion-expectation + WALK_NESTED_ENUM(OmpFallbackModifier, Value) // OMP fallback-modifier WALK_NESTED_ENUM(OmpInteropType, Value) // OMP InteropType WALK_NESTED_ENUM(OmpOrderClause, Ordering) // OMP ordering WALK_NESTED_ENUM(OmpOrderModifier, Value) // OMP order-modifier diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 4bd4fe79763d8..eaf1faac5e2e2 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -682,6 +682,13 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { } } +void OmpStructureChecker::Enter(const parser::OmpClause::DynGroupprivate &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_dyn_groupprivate); + parser::CharBlock source{GetContext().clauseSource}; + + OmpVerifyModifiers(x.v, llvm::omp::OMPC_dyn_groupprivate, source, context_); +} + void OmpStructureChecker::Enter(const parser::OmpDirectiveSpecification &x) { // OmpDirectiveSpecification exists on its own only in METADIRECTIVE. 
// In other cases it's a part of other constructs that handle directive @@ -3332,6 +3339,32 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) { } } + // Default access-group for DYN_GROUPPRIVATE is "cgroup". On a given + // construct there can be at most one DYN_GROUPPRIVATE with a given + // access-group. + const parser::OmpClause + *accGrpClause[parser::OmpAccessGroup::Value_enumSize] = {nullptr}; + for (auto [_, clause] : + FindClauses(llvm::omp::Clause::OMPC_dyn_groupprivate)) { + auto &wrapper{std::get(clause->u)}; + auto &modifiers{OmpGetModifiers(wrapper.v)}; + auto accGrp{parser::OmpAccessGroup::Value::Cgroup}; + if (auto *ag{OmpGetUniqueModifier(modifiers)}) { + accGrp = ag->v; + } + auto &firstClause{accGrpClause[llvm::to_underlying(accGrp)]}; + if (firstClause) { + context_ + .Say(clause->source, + "The access-group modifier can only occur on a single clause in a construct"_err_en_US) + .Attach(firstClause->source, + "Previous clause with access-group modifier"_en_US); + break; + } else { + firstClause = clause; + } + } + CheckRequireAtLeastOneOf(); } @@ -4689,10 +4722,12 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Copyin &x) { void OmpStructureChecker::CheckStructureComponent( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId) { auto CheckComponent{[&](const parser::Designator &designator) { - if (auto *dataRef{std::get_if(&designator.u)}) { + if (const parser::DataRef *dataRef{ + std::get_if(&designator.u)}) { if (!IsDataRefTypeParamInquiry(dataRef)) { - if (auto *comp{parser::Unwrap(*dataRef)}) { - context_.Say(comp->component.source, + const auto expr{AnalyzeExpr(context_, designator)}; + if (expr.has_value() && evaluate::HasStructureComponent(expr.value())) { + context_.Say(designator.source, "A variable that is part of another variable cannot appear on the %s clause"_err_en_US, parser::ToUpperCaseLetters(getClauseName(clauseId).str())); } @@ -5489,7 +5524,6 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default) 
CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) diff --git a/flang/lib/Semantics/openmp-modifiers.cpp b/flang/lib/Semantics/openmp-modifiers.cpp index 717fb0351ba5b..f191b4de2d579 100644 --- a/flang/lib/Semantics/openmp-modifiers.cpp +++ b/flang/lib/Semantics/openmp-modifiers.cpp @@ -74,6 +74,22 @@ unsigned OmpModifierDescriptor::since(llvm::omp::Clause id) const { // Note: The intent for these functions is to have them be automatically- // generated in the future. +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"access-group", + /*props=*/ + { + {61, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {61, {Clause::OMPC_dyn_groupprivate}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ @@ -321,6 +337,22 @@ const OmpModifierDescriptor &OmpGetDescriptor() { return desc; } +template <> +const OmpModifierDescriptor &OmpGetDescriptor() { + static const OmpModifierDescriptor desc{ + /*name=*/"fallback-modifier", + /*props=*/ + { + {61, {OmpProperty::Unique}}, + }, + /*clauses=*/ + { + {61, {Clause::OMPC_dyn_groupprivate}}, + }, + }; + return desc; +} + template <> const OmpModifierDescriptor &OmpGetDescriptor() { static const OmpModifierDescriptor desc{ diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 deleted file mode 100644 index 8acc399a92abe..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-inreduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! 
RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASKLOOP construct -subroutine omp_taskloop_inreduction() - integer x - x = 0 - !$omp taskloop in_reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 deleted file mode 100644 index 54f2580daf283..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause LASTPRIVATE in TASKLOOP construct -subroutine omp_taskloop_lastprivate() - integer x - x = 0 - !$omp taskloop lastprivate(x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_lastprivate diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 deleted file mode 100644 index 2a0c5985290e2..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s - -! 
CHECK: not yet implemented: Unhandled clause NOGROUP in TASKLOOP construct -subroutine omp_taskloop_nogroup() - integer x - x = 0 - !$omp taskloop nogroup - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_nogroup diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 deleted file mode 100644 index 0c16bd227257f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/taskloop-reduction.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -! CHECK: not yet implemented: Unhandled clause REDUCTION in TASKLOOP construct -subroutine omp_taskloop_reduction() - integer x - x = 0 - !$omp taskloop reduction(+:x) - do i = 1, 100 - x = x + 1 - end do - !$omp end taskloop -end subroutine omp_taskloop_reduction diff --git a/flang/test/Lower/OpenMP/taskloop.f90 b/flang/test/Lower/OpenMP/taskloop.f90 index 4a06e4def0c83..d23eef2d4ac2d 100644 --- a/flang/test/Lower/OpenMP/taskloop.f90 +++ b/flang/test/Lower/OpenMP/taskloop.f90 @@ -2,6 +2,15 @@ ! RUN: bbc -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s ! RUN: %flang_fc1 -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[LAST_PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[LAST_PRIVATE_X:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[QFOMP_TASKLOOP_NOGROUPEI_PRIVATE_I32:.*]] : i32 + ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = private} @[[OMP_TASKLOOP_UNTIEDEI_PRIVATE_I32:.*]] : i32 @@ -195,3 +204,46 @@ subroutine omp_taskloop_untied() end do !$omp end taskloop end subroutine + +!=============================================================================== +! 
`nogroup` clause +!=============================================================================== + +subroutine omp_taskloop_nogroup() + ! CHECK: omp.taskloop nogroup + !$omp taskloop nogroup + do i = 1, 10 + call foo() + end do + !$omp end taskloop +end subroutine + +!=============================================================================== +! `lastprivate` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPomp_taskloop_lastprivate +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_lastprivateEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_lastprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_lastprivateEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +subroutine omp_taskloop_lastprivate() + integer x + x = 0 + ! CHECK: omp.taskloop private(@[[LAST_PRIVATE_X]] %[[DECL_X]]#0 -> %[[ARG0]], @[[LAST_PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1]] : !fir.ref, !fir.ref) { + !$omp taskloop lastprivate(x) + do i = 1, 100 + ! CHECK: %[[DECL_ARG0:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_taskloop_lastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[LOAD_ARG0:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref + ! CHECK: %[[RES_ADD:.*]] = arith.addi %[[LOAD_ARG0]], %{{.*}} : i32 + ! CHECK: hlfir.assign %[[RES_ADD]] to %[[DECL_ARG0]]#0 : i32, !fir.ref + x = x + 1 + ! CHECK: %[[SELCT_RESULT:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : i1 + ! CHECK: fir.if %[[SELCT_RESULT]] { + ! CHECK: %[[LOADED_SUM:.*]] = fir.load %[[DECL_ARG0]]#0 : !fir.ref + ! CHECK: hlfir.assign %[[LOADED_SUM]] to %[[DECL_X]]#0 : i32, !fir.ref + ! CHECK: } + ! 
CHECK: omp.yield + end do + !$omp end taskloop +end subroutine omp_taskloop_lastprivate diff --git a/flang/test/Lower/taskloop-inreduction.f90 b/flang/test/Lower/taskloop-inreduction.f90 new file mode 100644 index 0000000000000..e7d3f96115fbd --- /dev/null +++ b/flang/test/Lower/taskloop-inreduction.f90 @@ -0,0 +1,40 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPomp_taskloop_inreduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_inreductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_inreductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_inreductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref +subroutine omp_taskloop_inreduction() + integer x + x = 0 + ! CHECK: omp.taskloop in_reduction(@[[ADD_RED_I32]] + ! CHECK: %[[DECL_X]]#0 -> %[[ARG0:.*]] : !fir.ref) private(@[[PRIVATE_I]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG0]] + ! 
CHECK-SAME: {uniq_name = "_QFomp_taskloop_inreductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !$omp taskloop in_reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref + end do + !$omp end taskloop +end subroutine omp_taskloop_inreduction diff --git a/flang/test/Lower/taskloop-reduction.f90 b/flang/test/Lower/taskloop-reduction.f90 new file mode 100644 index 0000000000000..e45c0181bcc8b --- /dev/null +++ b/flang/test/Lower/taskloop-reduction.f90 @@ -0,0 +1,39 @@ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[PRIVATE_I:.*]] : i32 + +! CHECK-LABEL: omp.declare_reduction +! CHECK-SAME: @[[ADD_RED_I32:.*]] : i32 init { +! CHECK: ^bb0(%{{.*}}: i32): +! CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32 +! CHECK: omp.yield(%[[C0_I32]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%{{.*}}: i32, %{{.*}}: i32): +! CHECK: %[[RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 +! CHECK: omp.yield(%[[RES]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPomp_taskloop_reduction +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_reductionEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_reductionEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFomp_taskloop_reductionEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFomp_taskloop_reductionEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[INIT_X:.*]] = arith.constant 0 : i32 +! 
CHECK: hlfir.assign %[[INIT_X]] to %[[DECL_X]]#0 : i32, !fir.ref +subroutine omp_taskloop_reduction() + integer x + x = 0 + ! CHECK: omp.taskloop private(@[[PRIVATE_I]] + ! CHECK-SAME: %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref) reduction(@[[ADD_RED_I32]] %[[DECL_X]]#0 -> %[[ARG1:.*]] : !fir.ref) { + ! CHECK: %[[VAL_ARG1:.*]]:2 = hlfir.declare %[[ARG1]] + !$omp taskloop reduction(+:x) + do i = 1, 100 + ! CHECK: %[[X_VAL:.*]] = fir.load %[[VAL_ARG1]]#0 : !fir.ref + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[X_VAL]], %{{.*}} : i32 + x = x + 1 + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[VAL_ARG1]]#0 : i32, !fir.ref + end do + !$omp end taskloop +end subroutine omp_taskloop_reduction diff --git a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 index 7d41efd348e50..599821dbe3377 100644 --- a/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 +++ b/flang/test/Parser/OpenMP/dyn-groupprivate-clause.f90 @@ -26,21 +26,21 @@ subroutine f00(n) subroutine f01(n) implicit none integer :: n - !$omp target dyn_groupprivate(strict: n) + !$omp target dyn_groupprivate(fallback(abort): n) !$omp end target end !UNPARSE: SUBROUTINE f01 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(STRICT: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(ABORT): n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Strict +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Abort !PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' !PARSE-TREE: | Flags = None @@ -49,21 +49,21 @@ subroutine f01(n) subroutine f02(n) implicit none integer :: n - !$omp target dyn_groupprivate(fallback, cgroup: n) + !$omp 
target dyn_groupprivate(fallback(default_mem), cgroup: n) !$omp end target end !UNPARSE: SUBROUTINE f02 (n) !UNPARSE: IMPLICIT NONE !UNPARSE: INTEGER n -!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK, CGROUP: n) +!UNPARSE: !$OMP TARGET DYN_GROUPPRIVATE(FALLBACK(DEFAULT_MEM), CGROUP: n) !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> DynGroupprivate -> OmpDynGroupprivateClause -!PARSE-TREE: | | Modifier -> OmpPrescriptiveness -> Value = Fallback +!PARSE-TREE: | | Modifier -> OmpFallbackModifier -> Value = Default_Mem !PARSE-TREE: | | Modifier -> OmpAccessGroup -> Value = Cgroup !PARSE-TREE: | | Scalar -> Integer -> Expr = 'n' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'n' diff --git a/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 new file mode 100644 index 0000000000000..f77a0b0d35f44 --- /dev/null +++ b/flang/test/Semantics/OpenMP/dyn-groupprivate.f90 @@ -0,0 +1,8 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=61 + +subroutine f00(x) + integer :: x + !ERROR: The access-group modifier can only occur on a single clause in a construct + !$omp target dyn_groupprivate(cgroup: x), dyn_groupprivate(10) + !$omp end target +end diff --git a/flang/test/Semantics/OpenMP/in-reduction.f90 b/flang/test/Semantics/OpenMP/in-reduction.f90 index 1b82134b7104b..3f1e735214061 100644 --- a/flang/test/Semantics/OpenMP/in-reduction.f90 +++ b/flang/test/Semantics/OpenMP/in-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(2)) !$omp end target @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) 
end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the IN_REDUCTION clause !ERROR: The base expression of an array element or section in IN_REDUCTION clause must be an identifier !$omp target in_reduction(+: x%a(1:10)) !$omp end target diff --git a/flang/test/Semantics/OpenMP/reduction15.f90 b/flang/test/Semantics/OpenMP/reduction15.f90 index 1d4de6ff702bb..61fa417f1111c 100644 --- a/flang/test/Semantics/OpenMP/reduction15.f90 +++ b/flang/test/Semantics/OpenMP/reduction15.f90 @@ -13,6 +13,7 @@ module m subroutine f00 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(2)) do i = 1, 10 @@ -22,6 +23,7 @@ subroutine f00 subroutine f01 type(t) :: x + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause !ERROR: The base expression of an array element or section in REDUCTION clause must be an identifier !$omp do reduction (+ : x%a(1:10)) do i = 1, 10 diff --git a/flang/test/Semantics/OpenMP/reduction17.f90 b/flang/test/Semantics/OpenMP/reduction17.f90 new file mode 100644 index 0000000000000..5b6e8e977f46c --- /dev/null +++ b/flang/test/Semantics/OpenMP/reduction17.f90 @@ -0,0 +1,18 @@ +! Test that Structure Component Array Elements are caught by Semantics and return an error +! 
RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=45 + +type test_type + integer :: array(2) +end type + +contains + subroutine test + type(test_type) :: x + + !ERROR: A variable that is part of another variable cannot appear on the REDUCTION clause + !$omp do reduction(+: x%array(2)) + do i=1, 2 + end do + !$omp end do + end subroutine +end diff --git a/flang/test/Semantics/OpenMP/task-reduction.f90 b/flang/test/Semantics/OpenMP/task-reduction.f90 index 5a18ee48e7728..f76b07ae568f4 100644 --- a/flang/test/Semantics/OpenMP/task-reduction.f90 +++ b/flang/test/Semantics/OpenMP/task-reduction.f90 @@ -47,6 +47,7 @@ subroutine f06 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(2)) !$omp end taskgroup @@ -57,6 +58,7 @@ subroutine f07 integer :: a(10) end type type(t) :: x +!ERROR: A variable that is part of another variable cannot appear on the TASK_REDUCTION clause !ERROR: The base expression of an array element or section in TASK_REDUCTION clause must be an identifier !$omp taskgroup task_reduction(+: x%a(1:10)) !$omp end taskgroup diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 3845ec8376794..6d3036dfedddf 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -859,7 +859,6 @@ set(files __type_traits/is_reference.h __type_traits/is_reference_wrapper.h __type_traits/is_referenceable.h - __type_traits/is_replaceable.h __type_traits/is_same.h __type_traits/is_scalar.h __type_traits/is_signed.h diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index aef036a2c9586..92ff5c701e0d3 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -75,9 +75,7 @@ class 
_LIBCPP_EXPORTED_FROM_ABI exception_ptr { public: // exception_ptr is basically a COW string so it is trivially relocatable. - // It is also replaceable because assignment has normal value semantics. using __trivially_relocatable _LIBCPP_NODEBUG = exception_ptr; - using __replaceable _LIBCPP_NODEBUG = exception_ptr; _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {} _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index 8b3eeebd38ae7..be37e8ab66ac4 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -30,7 +30,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_constructible.h> @@ -472,8 +471,6 @@ class expected : private __expected_base<_Tp, _Err> { __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value && __libcpp_is_trivially_relocatable<_Err>::value, expected, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<_Tp> && __is_replaceable_v<_Err>, expected, void>; template using rebind = expected<_Up, error_type>; diff --git a/libcxx/include/__locale b/libcxx/include/__locale index eb7b7786208e8..0948bd29b6f1b 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -57,9 +57,8 @@ _LIBCPP_HIDE_FROM_ABI const _Facet& use_facet(const locale&); class _LIBCPP_EXPORTED_FROM_ABI locale { public: // locale is essentially a shared_ptr that doesn't support weak_ptrs and never got a move constructor, - // so it is trivially relocatable. Like shared_ptr, it is also replaceable. + // so it is trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = locale; - using __replaceable _LIBCPP_NODEBUG = locale; // types: class _LIBCPP_EXPORTED_FROM_ABI facet; diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index e90db587d2836..67b94114988b5 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -317,10 +317,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI shared_ptr { #endif // A shared_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It is also replaceable because assignment just rebinds the - // shared_ptr to manage a different object. + // any bookkeeping, so it's always trivially relocatable. using __trivially_relocatable _LIBCPP_NODEBUG = shared_ptr; - using __replaceable _LIBCPP_NODEBUG = shared_ptr; private: element_type* __ptr_; @@ -1186,9 +1184,8 @@ class _LIBCPP_SHARED_PTR_TRIVIAL_ABI weak_ptr { #endif // A weak_ptr contains only two raw pointers which point to the heap and move constructing already doesn't require - // any bookkeeping, so it's always trivially relocatable. It's also replaceable for the same reason. + // any bookkeeping, so it's always trivially relocatable. 
using __trivially_relocatable _LIBCPP_NODEBUG = weak_ptr; - using __replaceable _LIBCPP_NODEBUG = weak_ptr; private: element_type* __ptr_; diff --git a/libcxx/include/__memory/unique_ptr.h b/libcxx/include/__memory/unique_ptr.h index eff24546cdc01..491d1c2e42417 100644 --- a/libcxx/include/__memory/unique_ptr.h +++ b/libcxx/include/__memory/unique_ptr.h @@ -39,7 +39,6 @@ #include <__type_traits/is_function.h> #include <__type_traits/is_pointer.h> #include <__type_traits/is_reference.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> @@ -145,8 +144,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr { __libcpp_is_trivially_relocatable::value && __libcpp_is_trivially_relocatable::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v && __is_replaceable_v, unique_ptr, void>; private: _LIBCPP_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_); @@ -413,8 +410,6 @@ class _LIBCPP_UNIQUE_PTR_TRIVIAL_ABI unique_ptr<_Tp[], _Dp> { __libcpp_is_trivially_relocatable::value && __libcpp_is_trivially_relocatable::value, unique_ptr, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v && __is_replaceable_v, unique_ptr, void>; private: template diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index 15368a3bc8955..1e05e4df8ba0f 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -30,7 +30,6 @@ #include <__type_traits/integral_constant.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_destructible.h> #include <__type_traits/is_trivially_relocatable.h> @@ -484,10 +483,6 @@ public: __libcpp_is_trivially_relocatable::value && 
__libcpp_is_trivially_relocatable::value, __split_buffer, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v && __container_allocator_is_replaceable<__alloc_traits>::value, - __split_buffer, - void>; __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete; diff --git a/libcxx/include/__type_traits/is_replaceable.h b/libcxx/include/__type_traits/is_replaceable.h deleted file mode 100644 index e1d17c099cd3a..0000000000000 --- a/libcxx/include/__type_traits/is_replaceable.h +++ /dev/null @@ -1,61 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H -#define _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H - -#include <__config> -#include <__type_traits/enable_if.h> -#include <__type_traits/integral_constant.h> -#include <__type_traits/is_same.h> -#include <__type_traits/is_trivially_copyable.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// A type is replaceable if, with `x` and `y` being different objects, `x = std::move(y)` is equivalent to: -// -// std::destroy_at(&x) -// std::construct_at(&x, std::move(y)) -// -// This allows turning a move-assignment into a sequence of destroy + move-construct, which -// is often more efficient. This is especially relevant when the move-construct is in fact -// part of a trivial relocation from somewhere else, in which case there is a huge win. -// -// Note that this requires language support in order to be really effective, but we -// currently emulate the base template with something very conservative. 
-template -struct __is_replaceable : is_trivially_copyable<_Tp> {}; - -template -struct __is_replaceable<_Tp, __enable_if_t::value> > : true_type {}; - -template -inline const bool __is_replaceable_v = __is_replaceable<_Tp>::value; - -// Determines whether an allocator member of a container is replaceable. -// -// First, we require the allocator type to be considered replaceable. If not, then something fishy might be -// happening. Assuming the allocator type is replaceable, we conclude replaceability of the allocator as a -// member of the container if the allocator always compares equal (in which case propagation doesn't matter), -// or if the allocator always propagates on assignment, which is required in order for move construction and -// assignment to be equivalent. -template -struct __container_allocator_is_replaceable - : integral_constant && - (_AllocatorTraits::is_always_equal::value || - (_AllocatorTraits::propagate_on_container_move_assignment::value && - _AllocatorTraits::propagate_on_container_copy_assignment::value))> {}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_REPLACEABLE_H diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index 33694c52430f1..61485123114ba 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -31,8 +31,6 @@ #include <__type_traits/is_implicitly_default_constructible.h> #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_replaceable.h> -#include <__type_traits/is_same.h> #include <__type_traits/is_swappable.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/nat.h> @@ -102,7 +100,6 @@ struct pair __conditional_t<__libcpp_is_trivially_relocatable<_T1>::value && __libcpp_is_trivially_relocatable<_T2>::value, pair, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_T1> && __is_replaceable_v<_T2>, pair, void>; 
_LIBCPP_HIDE_FROM_ABI pair(pair const&) = default; _LIBCPP_HIDE_FROM_ABI pair(pair&&) = default; diff --git a/libcxx/include/__vector/vector.h b/libcxx/include/__vector/vector.h index a100b1675516e..7051e044314ea 100644 --- a/libcxx/include/__vector/vector.h +++ b/libcxx/include/__vector/vector.h @@ -54,7 +54,6 @@ #include <__type_traits/is_nothrow_assignable.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_pointer.h> -#include <__type_traits/is_replaceable.h> #include <__type_traits/is_same.h> #include <__type_traits/is_trivially_relocatable.h> #include <__type_traits/type_identity.h> @@ -123,10 +122,6 @@ class vector { __libcpp_is_trivially_relocatable::value && __libcpp_is_trivially_relocatable::value, vector, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v && __container_allocator_is_replaceable<__alloc_traits>::value, - vector, - void>; static_assert(__check_valid_allocator::value, ""); static_assert(is_same::value, diff --git a/libcxx/include/array b/libcxx/include/array index 9643fc1dd9dca..ff46838e2e8e2 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -134,7 +134,6 @@ template const T&& get(const array&&) noexce # include <__type_traits/is_const.h> # include <__type_traits/is_constructible.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -176,7 +175,6 @@ template struct array { using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, array, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<__is_replaceable_v<_Tp>, array, void>; // types: using __self _LIBCPP_NODEBUG = array; diff --git a/libcxx/include/deque b/libcxx/include/deque index cbf4b98e07a5b..08bf8141eb782 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ 
-227,7 +227,6 @@ template # include <__type_traits/is_convertible.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -531,10 +530,6 @@ public: __libcpp_is_trivially_relocatable<__map>::value && __libcpp_is_trivially_relocatable::value, deque, void>; - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v<__map> && __container_allocator_is_replaceable<__alloc_traits>::value, - deque, - void>; static_assert(is_nothrow_default_constructible::value == is_nothrow_default_constructible<__pointer_allocator>::value, diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 85fe88afe248c..27f60e0c0a055 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -271,10 +271,6 @@ module std_core [system] { header "__type_traits/is_referenceable.h" export std_core.type_traits.integral_constant } - module is_replaceable { - header "__type_traits/is_replaceable.h" - export std_core.type_traits.integral_constant - } module is_same { header "__type_traits/is_same.h" export std_core.type_traits.integral_constant diff --git a/libcxx/include/optional b/libcxx/include/optional index ef1bfd3ec44c0..a3023622e2067 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -230,7 +230,6 @@ namespace std { # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_object.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_scalar.h> # include <__type_traits/is_swappable.h> @@ -631,7 +630,6 @@ public: # endif using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>; - using 
__replaceable _LIBCPP_NODEBUG = conditional_t<__is_replaceable_v<_Tp>, optional, void>; private: // Disable the reference extension using this static assert. diff --git a/libcxx/include/string b/libcxx/include/string index ede42467b99fe..7a247f5459770 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -632,7 +632,6 @@ basic_string operator""s( const char32_t *str, size_t len ); # include <__type_traits/is_generic_transparent_comparator.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_standard_layout.h> # include <__type_traits/is_trivially_constructible.h> @@ -757,9 +756,6 @@ public: // external memory. In such cases, the destructor is responsible for unpoisoning // the memory to avoid triggering false positives. // Therefore it's crucial to ensure the destructor is called. - // - // However, it is replaceable since implementing move-assignment as a destroy + move-construct - // will maintain the right ASAN state. 
using __trivially_relocatable = void; # else using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t< @@ -767,10 +763,6 @@ public: basic_string, void>; # endif - using __replaceable _LIBCPP_NODEBUG = - __conditional_t<__is_replaceable_v && __container_allocator_is_replaceable<__alloc_traits>::value, - basic_string, - void>; # if __has_feature(address_sanitizer) && _LIBCPP_INSTRUMENTED_WITH_ASAN _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pointer __asan_volatile_wrapper(pointer const& __ptr) const { diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 0cfcd9a4fd9c5..caa473012a7c4 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -252,7 +252,6 @@ template # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_relocatable.h> @@ -596,7 +595,6 @@ class _LIBCPP_NO_SPECIALIZATIONS tuple { public: using __trivially_relocatable _LIBCPP_NODEBUG = __conditional_t<_And<__libcpp_is_trivially_relocatable<_Tp>...>::value, tuple, void>; - using __replaceable _LIBCPP_NODEBUG = __conditional_t<_And<__is_replaceable<_Tp>...>::value, tuple, void>; // [tuple.cnstr] diff --git a/libcxx/include/variant b/libcxx/include/variant index 8e958581a6b07..df587ccf23843 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -247,7 +247,6 @@ namespace std { # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_reference.h> -# include <__type_traits/is_replaceable.h> # include <__type_traits/is_same.h> # include <__type_traits/is_swappable.h> # include <__type_traits/is_trivially_assignable.h> @@ -1172,7 +1171,6 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES _LIBCPP_NO_SPECIALIZATIONS variant public: using __trivially_relocatable 
_LIBCPP_NODEBUG = conditional_t<_And<__libcpp_is_trivially_relocatable<_Types>...>::value, variant, void>; - using __replaceable _LIBCPP_NODEBUG = conditional_t<_And<__is_replaceable<_Types>...>::value, variant, void>; template , _Dummy>::value, int> = 0> diff --git a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp deleted file mode 100644 index c04e9443c8e67..0000000000000 --- a/libcxx/test/libcxx/type_traits/is_replaceable.compile.pass.cpp +++ /dev/null @@ -1,353 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <__type_traits/is_replaceable.h> -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "constexpr_char_traits.h" -#include "test_allocator.h" -#include "test_macros.h" - -#ifndef TEST_HAS_NO_LOCALIZATION -# include -#endif - -template -struct NonPropagatingStatefulMoveAssignAlloc : std::allocator { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::false_type; - template - struct rebind { - using other = NonPropagatingStatefulMoveAssignAlloc; - }; -}; - -template -struct NonPropagatingStatefulCopyAssignAlloc : std::allocator { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::false_type; - template - struct rebind { - using other = NonPropagatingStatefulCopyAssignAlloc; - }; -}; - -template -struct NonPropagatingStatelessMoveAssignAlloc : std::allocator { - using propagate_on_container_move_assignment = std::false_type; - using is_always_equal = std::true_type; - template - 
struct rebind { - using other = NonPropagatingStatelessMoveAssignAlloc; - }; -}; - -template -struct NonPropagatingStatelessCopyAssignAlloc : std::allocator { - using propagate_on_container_copy_assignment = std::false_type; - using is_always_equal = std::true_type; - template - struct rebind { - using other = NonPropagatingStatelessCopyAssignAlloc; - }; -}; - -template -struct NonReplaceableStatelessAlloc : std::allocator { - // Ensure that we don't consider an allocator that is a member of a container to be - // replaceable if it's not replaceable, even if it always compares equal and always propagates. - using propagate_on_container_move_assignment = std::true_type; - using propagate_on_container_copy_assignment = std::true_type; - using is_always_equal = std::true_type; - NonReplaceableStatelessAlloc() = default; - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc const&) {} - NonReplaceableStatelessAlloc(NonReplaceableStatelessAlloc&&) = default; - template - struct rebind { - using other = NonReplaceableStatelessAlloc; - }; -}; -static_assert(!std::__is_replaceable >::value, ""); - -static_assert(!std::__is_replaceable >::value, ""); // we use that property below - -struct Empty {}; -static_assert(std::__is_replaceable::value, ""); -static_assert(std::__is_replaceable::value, ""); -static_assert(std::__is_replaceable::value, ""); -static_assert(std::__is_replaceable::value, ""); - -struct TriviallyCopyable { - char c; - int i; - Empty s; -}; -static_assert(std::__is_replaceable::value, ""); - -struct NotTriviallyCopyable { - NotTriviallyCopyable(const NotTriviallyCopyable&); - ~NotTriviallyCopyable(); -}; -static_assert(!std::__is_replaceable::value, ""); - -struct MoveOnlyTriviallyCopyable { - MoveOnlyTriviallyCopyable(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable& operator=(const MoveOnlyTriviallyCopyable&) = delete; - MoveOnlyTriviallyCopyable(MoveOnlyTriviallyCopyable&&) = default; - MoveOnlyTriviallyCopyable& 
operator=(MoveOnlyTriviallyCopyable&&) = default; -}; -static_assert(std::__is_replaceable::value, ""); - -struct CustomCopyAssignment { - CustomCopyAssignment(const CustomCopyAssignment&) = default; - CustomCopyAssignment(CustomCopyAssignment&&) = default; - CustomCopyAssignment& operator=(const CustomCopyAssignment&); - CustomCopyAssignment& operator=(CustomCopyAssignment&&) = default; -}; -static_assert(!std::__is_replaceable::value, ""); - -struct CustomMoveAssignment { - CustomMoveAssignment(const CustomMoveAssignment&) = default; - CustomMoveAssignment(CustomMoveAssignment&&) = default; - CustomMoveAssignment& operator=(const CustomMoveAssignment&) = default; - CustomMoveAssignment& operator=(CustomMoveAssignment&&); -}; -static_assert(!std::__is_replaceable::value, ""); - -// library-internal types -// ---------------------- - -// __split_buffer -static_assert( - std::__is_replaceable, std::__split_buffer_pointer_layout> >::value, - ""); -static_assert(std::__is_replaceable, - std::__split_buffer_pointer_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer, std::__split_buffer_pointer_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer, std::__split_buffer_pointer_layout > >:: - value, - ""); - -static_assert( - std::__is_replaceable, std::__split_buffer_size_layout> >::value, ""); -static_assert(std::__is_replaceable, - std::__split_buffer_size_layout> >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::__split_buffer, std::__split_buffer_size_layout > >::value, - ""); -static_assert( - std::__is_replaceable< - 
std::__split_buffer, std::__split_buffer_size_layout > >:: - value, - ""); -static_assert( - std::__is_replaceable< - std::__split_buffer, std::__split_buffer_size_layout > >:: - value, - ""); - -// standard library types -// ---------------------- - -// array -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable, 0> >::value, ""); - -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable, 1> >::value, ""); - -// basic_string -struct MyChar { - char c; -}; -template -struct NotReplaceableCharTraits : constexpr_char_traits { - NotReplaceableCharTraits(const NotReplaceableCharTraits&); - NotReplaceableCharTraits& operator=(const NotReplaceableCharTraits&); - ~NotReplaceableCharTraits(); -}; - -static_assert(std::__is_replaceable, std::allocator > >::value, - ""); -static_assert( - std::__is_replaceable, std::allocator > >::value, ""); -static_assert( - std::__is_replaceable, std::allocator > >::value, - ""); -static_assert(!std::__is_replaceable, test_allocator > >::value, - ""); -static_assert(!std::__is_replaceable< - std::basic_string, NonReplaceableStatelessAlloc > >::value, - ""); -static_assert(std::__is_replaceable< - std::basic_string, std::allocator > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string, NonPropagatingStatefulCopyAssignAlloc > >::value, - ""); -static_assert( - !std::__is_replaceable< - std::basic_string, NonPropagatingStatefulMoveAssignAlloc > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string, NonPropagatingStatelessCopyAssignAlloc > >::value, - ""); -static_assert( - std::__is_replaceable< - std::basic_string, NonPropagatingStatelessMoveAssignAlloc > >::value, - ""); - -// deque -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); 
-static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(std::__is_replaceable > >::value, ""); -static_assert(std::__is_replaceable > >::value, ""); - -// exception_ptr -#ifndef _LIBCPP_ABI_MICROSOFT -static_assert(std::__is_replaceable::value, ""); -#endif - -// expected -#if TEST_STD_VER >= 23 -static_assert(std::__is_replaceable >::value); -static_assert(!std::__is_replaceable>::value); -static_assert(!std::__is_replaceable>::value); -static_assert(!std::__is_replaceable>::value); -#endif - -// locale -#ifndef TEST_HAS_NO_LOCALIZATION -static_assert(std::__is_replaceable::value, ""); -#endif - -// optional -#if TEST_STD_VER >= 17 -static_assert(std::__is_replaceable>::value, ""); -static_assert(!std::__is_replaceable>::value, ""); -#endif - -// pair -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); - -// shared_ptr -static_assert(std::__is_replaceable >::value, ""); - -// tuple -#if TEST_STD_VER >= 11 -static_assert(std::__is_replaceable >::value, ""); - -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); - -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -#endif // TEST_STD_VER >= 11 - -// unique_ptr -struct NonReplaceableDeleter { - NonReplaceableDeleter(const NonReplaceableDeleter&); - NonReplaceableDeleter& operator=(const NonReplaceableDeleter&); - ~NonReplaceableDeleter(); - - template - void operator()(T*); -}; - -struct NonReplaceablePointer { - struct pointer { - pointer(const pointer&); - pointer& 
operator=(const pointer&); - ~pointer(); - }; - - template - void operator()(T*); -}; - -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); - -// variant -#if TEST_STD_VER >= 17 -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); - -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable >::value, ""); -#endif // TEST_STD_VER >= 17 - -// vector -static_assert(std::__is_replaceable >::value, ""); -static_assert(std::__is_replaceable >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(!std::__is_replaceable > >::value, ""); -static_assert(std::__is_replaceable > >::value, ""); -static_assert(std::__is_replaceable > >::value, ""); - -// weak_ptr -static_assert(std::__is_replaceable >::value, ""); - -// TODO: Mark all the replaceable STL types as such diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py deleted file mode 100644 index 5da1d9afee911..0000000000000 --- a/libcxx/utils/libcxx/test/features.py +++ /dev/null @@ -1,920 +0,0 @@ -# ===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -from libcxx.test.dsl import * -from lit.BooleanExpression import BooleanExpression -import re -import shutil -import subprocess -import sys - -_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) -_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) -_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) -_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) -_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) -_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) -_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) -_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) -_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) - -def _getAndroidDeviceApi(cfg): - return int( - programOutput( - cfg, - r""" - #include - #include - int main(int, char**) { - printf("%d\n", android_get_device_api_level()); - return 0; - } - """, - ) - ) - - -def _mingwSupportsModules(cfg): - # Only mingw headers are known to work with libc++ built as a module, - # at the moment. - if not "__MINGW32__" in compilerMacros(cfg): - return False - # For mingw headers, check for a version known to support being built - # as a module. - return sourceBuilds( - cfg, - """ - #include <_mingw_mac.h> - #if __MINGW64_VERSION_MAJOR < 12 - #error Headers known to be incompatible - #elif __MINGW64_VERSION_MAJOR == 12 - // The headers were fixed to work with libc++ modules during - // __MINGW64_VERSION_MAJOR == 12. The headers became compatible - // with libc++ built as a module in - // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the - // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed - // removed the now unused __mingw_static_ovr define. Use this - // as indicator for whether we've got new enough headers. 
- #ifdef __mingw_static_ovr - #error Headers too old - #endif - #else - // __MINGW64_VERSION_MAJOR > 12 should be ok. - #endif - int main(int, char**) { return 0; } - """, - ) - - -# Lit features are evaluated in order. Some checks may require the compiler detection to have -# run first in order to work properly. -DEFAULT_FEATURES = [ - # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. - Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), - Feature(name="cl-style-warnings", when=_isClExe), - Feature(name="apple-clang", when=_isAppleClang), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature( - name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isAppleClang, - ), - Feature(name="clang", when=_isClang), - Feature( - name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - Feature( - name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), - when=_isClang, - ), - # Note: Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings - # on GCC or spurious diagnostics are issued. - # - # TODO: - # - Enable -Wplacement-new with GCC. - # - Enable -Wclass-memaccess with GCC. 
- Feature( - name="gcc", - when=_isGCC, - actions=[ - AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), - AddCompileFlag("-Wno-placement-new"), - AddCompileFlag("-Wno-class-memaccess"), - AddFeature("GCC-ALWAYS_INLINE-FIXME"), - ], - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature( - name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), - when=_isGCC, - ), - Feature(name="msvc", when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), - Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), - - Feature( - name="diagnose-if-support", - when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), - actions=[AddCompileFlag("-Wuser-defined-warnings")], - ), - Feature( - name="character-conversion-warnings", - when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), - ), - # Tests to validate whether the compiler has a way to set the maximum number - # of steps during constant evaluation. Since the flag differs per compiler - # store the "valid" flag as a feature. 
This allows passing the proper compile - # flag to the compiler: - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 - # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 - Feature( - name="has-fconstexpr-steps", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), - ), - Feature( - name="has-fconstexpr-ops-limit", - when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), - ), - Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), - Feature( - name="fdelayed-template-parsing", - when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), - ), - Feature( - name="has-fobjc-arc", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") - and sys.platform.lower().strip() == "darwin", - ), # TODO: this doesn't handle cross-compiling to Apple platforms. - Feature( - name="objective-c++", - when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), - ), - Feature( - name="verify-support", - when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), - ), - Feature( - name="add-latomic-workaround", # https://llvm.org/PR73361 - when=lambda cfg: sourceBuilds( - cfg, "int main(int, char**) { return 0; }", ["-latomic"] - ), - actions=[AddLinkFlag("-latomic")], - ), - Feature( - name="has-64-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include - struct Large { char storage[64/8]; }; - std::atomic x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - Feature( - name="has-1024-bit-atomics", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include - struct Large { char storage[1024/8]; }; - std::atomic x; - int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } - """, - ), - ), - # Tests that require 64-bit architecture - Feature( - name="32-bit-pointer", - when=lambda cfg: sourceBuilds( - cfg, - """ - int main(int, char**) { - 
static_assert(sizeof(void *) == 4); - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): - # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 - Feature( - name="win32-broken-utf8-wchar-ctype", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include - #include - int main(int, char**) { - setlocale(LC_ALL, "en_US.UTF-8"); - return towlower(L'\\xDA') != L'\\xFA'; - } - """, - ), - ), - # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). - # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 - Feature( - name="win32-broken-printf-g-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include - #include - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); - return strcmp(buf, "0."); - } - """, - ), - ), - # Check for a Windows UCRT bug (not fixed upstream yet). - # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", - # while other C runtimes produce just "0x0p+0". - # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 - Feature( - name="win32-broken-printf-a-precision", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and not programSucceeds( - cfg, - """ - #include - #include - int main(int, char**) { - char buf[100]; - snprintf(buf, sizeof(buf), "%a", 0.0); - return strcmp(buf, "0x0p+0"); - } - """, - ), - ), - # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had - # mon_decimal_point == ".", which our tests don't handle. 
- Feature( - name="glibc-old-ru_RU-decimal-point", - when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) - or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" - and not programSucceeds( - cfg, - """ - #include - #include - int main(int, char**) { - setlocale(LC_ALL, "ru_RU.UTF-8"); - return strcmp(localeconv()->mon_decimal_point, ","); - } - """, - ), - ), - Feature( - name="has-unix-headers", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include - #include - int main(int, char**) { - int fd[2]; - return pipe(fd); - } - """, - ), - ), - # Whether Bash can run on the executor. - # This is not always the case, for example when running on embedded systems. - # - # For the corner case of bash existing, but it being missing in the path - # set in %{exec} as "--env PATH=one-single-dir", the executor does find - # and executes bash, but bash then can't find any other common shell - # utilities. Test executing "bash -c 'bash --version'" to see if bash - # manages to find binaries to execute. - Feature( - name="executor-has-no-bash", - when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, - ), - # Whether module support for the platform is available. - Feature( - name="has-no-cxx-module-support", - # The libc of these platforms have functions with internal linkage. - # This is not allowed per C11 7.1.2 Standard headers/6 - # Any declaration of a library function shall have external linkage. - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) - or "__FreeBSD__" in compilerMacros(cfg) - or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) - or platform.system().lower().startswith("aix") - # Avoid building on platforms that don't support modules properly. - or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") - # older versions don't support extern "C++", newer versions don't support main in named module. 
- or not ( - sourceBuilds( - cfg, - """ - export module test; - extern "C++" int main(int, char**) { return 0; } - """, - ) - or sourceBuilds( - cfg, - """ - export module test; - int main(int, char**) { return 0; } - """, - ) - ), - ), - # The time zone validation tests compare the output of zdump against the - # output generated by 's time zone support. - Feature( - name="has-no-zdump", - when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, - ), -] - -# Deduce and add the test features that that are implied by the #defines in -# the <__config> header. -# -# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that -# is defined after including <__config>, add a Lit feature called -# `libcpp-xxx-yyy-zzz`. When a macro is defined to a specific value -# (e.g. `_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=`. -# -# Note that features that are more strongly tied to libc++ are named libcpp-foo, -# while features that are more general in nature are not prefixed with 'libcpp-'. 
-macros = { - "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", - "_LIBCPP_ABI_VERSION": "libcpp-abi-version", - "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", - "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", - "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", - "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", - "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", - "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", - "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", -} -for macro, feature in macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), - when=lambda cfg, m=macro: m in compilerMacros(cfg), - ) - ) - -true_false_macros = { - "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", - "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", -} -for macro, feature in true_false_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "1", - ) - ) - -inverted_macros = { - "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", - "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", - "_LIBCPP_HAS_LOCALIZATION": "no-localization", - "_LIBCPP_HAS_THREADS": "no-threads", - "_LIBCPP_HAS_MONOTONIC_CLOCK": "no-monotonic-clock", - "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", - "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", - "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", - 
"_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", - "_LIBCPP_HAS_TERMINAL": "no-terminal", -} -for macro, feature in inverted_macros.items(): - DEFAULT_FEATURES.append( - Feature( - name=feature, - when=lambda cfg, m=macro: m in compilerMacros(cfg) - and compilerMacros(cfg)[m] == "0", - ) - ) - -# Mapping from canonical locale names (used in the tests) to possible locale -# names on various systems. Each locale is considered supported if any of the -# alternative names is supported. -locales = { - "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], - "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], - "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], - "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], - "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], - "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], - "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], -} -provide_locale_conversions = { - "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], - "ru_RU.UTF-8": ["mon_thousands_sep"], -} -for locale, alts in locales.items(): - # Note: Using alts directly in the lambda body here will bind it to the value at the - # end of the loop. Assigning it to a default argument works around this issue. - DEFAULT_FEATURES.append( - Feature( - name="locale.{}".format(locale), - when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), - actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( - cfg, locale, alts, provide_locale_conversions[locale] - ) - if locale in provide_locale_conversions - and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or - compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") - else [], - ), - ) - - -# Provide environment locale conversions through substitutions to avoid platform specific -# maintenance. 
-def _getLocaleFlagsAction(cfg, locale, alts, members): - alts_list = ",".join([f'"{l}"' for l in alts]) - get_member_list = ",".join([f"lc->{m}" for m in members]) - - localeconv_info = programOutput( - cfg, - r""" - #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) - #define _CRT_SECURE_NO_WARNINGS - #endif - #include - #include - #include - #include - - // Print each requested locale conversion member on separate lines. - int main(int, char**) { - const char* locales[] = { %s }; - for (int loc_i = 0; loc_i < %d; ++loc_i) { - if (!setlocale(LC_ALL, locales[loc_i])) { - continue; // Choose first locale name that is recognized. - } - - lconv* lc = localeconv(); - const char* members[] = { %s }; - for (size_t m_i = 0; m_i < %d; ++m_i) { - if (!members[m_i]) { - printf("\n"); // member value is an empty string - continue; - } - - size_t len = mbstowcs(nullptr, members[m_i], 0); - if (len == static_cast(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - return 1; - } - // Include room for null terminator. Use malloc as these features - // are also used by lit configs that don't use -lc++ (libunwind tests). - wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); - size_t ret = mbstowcs(dst, members[m_i], len + 1); - if (ret == static_cast(-1)) { - fprintf(stderr, "mbstowcs failed unexpectedly\n"); - free(dst); - return 1; - } - - for (size_t i = 0; i < len; ++i) { - if (dst[i] > 0x7F) { - printf("\\u%%04x", dst[i]); - } else { - // c++03 does not allow basic ascii-range characters in UCNs - printf("%%c", (char)dst[i]); - } - } - printf("\n"); - free(dst); - } - return 0; - } - - return 1; - } - """ - % (alts_list, len(alts), get_member_list, len(members)), - ) - valid_define_name = re.sub(r"[.-]", "_", locale).upper() - return [ - # Provide locale conversion through a substitution. 
- # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" - AddSubstitution( - f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", - lambda cfg, value=value: f"'L\"{value}\"'", - ) - for member, value in zip(members, localeconv_info.split("\n")) - ] - - -# Add features representing the target platform name: darwin, linux, windows, etc... -DEFAULT_FEATURES += [ - Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), - Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), - Feature( - name="windows-dll", - when=lambda cfg: "_WIN32" in compilerMacros(cfg) - and sourceBuilds( - cfg, - """ - #include - int main(int, char**) { return 0; } - """, - ) - and programSucceeds( - cfg, - """ - #include - #include - #include - int main(int, char**) { - // Get a pointer to a data member that gets linked from the C++ - // library. This must be a data member (functions can get - // thunk inside the calling executable), and must not be - // something that is defined inline in headers. - void *ptr = &std::cout; - // Get a handle to the current main executable. - void *exe = GetModuleHandle(NULL); - // The handle points at the PE image header. Navigate through - // the header structure to find the size of the PE image (the - // executable). - PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; - PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); - PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; - void *exeend = (BYTE*)exe + peheader->SizeOfImage; - // Check if the tested pointer - the data symbol from the - // C++ library - is located within the exe. - if (ptr >= exe && ptr <= exeend) - return 1; - // Return success if it was outside of the executable, i.e. - // loaded from a DLL. 
- return 0; - } - """, - ), - actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], - ), - Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), - Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), - Feature( - name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-ANDROID-FIXME", - when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), - ), - Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), - Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), - Feature( - name="LIBCXX-FREEBSD-FIXME", - when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-PICOLIBC-FIXME", - when=lambda cfg: sourceBuilds( - cfg, - """ - #include - #ifndef __PICOLIBC__ - #error not picolibc - #endif - int main(int, char**) { return 0; } - """, - ), - ), - Feature( - name="LIBCXX-AMDGPU-FIXME", - when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), - ), - Feature( - name="LIBCXX-NVPTX-FIXME", - when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), - ), - Feature( - name="can-create-symlinks", - when=lambda cfg: "_WIN32" not in compilerMacros(cfg) - or programSucceeds( - cfg, - # Creation of symlinks require elevated privileges on Windows unless - # Windows developer mode is enabled. - """ - #include - #include - int main(int, char**) { - CHAR tempDirPath[MAX_PATH]; - DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); - if (tempPathRet == 0 || tempPathRet > MAX_PATH) { - return 1; - } - - CHAR tempFilePath[MAX_PATH]; - UINT uRetVal = GetTempFileNameA( - tempDirPath, - "cxx", // Prefix - 0, // Unique=0 also implies file creation. 
- tempFilePath); - if (uRetVal == 0) { - return 1; - } - - CHAR symlinkFilePath[MAX_PATH]; - int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); - if (ret == -1) { - DeleteFileA(tempFilePath); - return 1; - } - - // Requires either administrator, or developer mode enabled. - BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, - tempFilePath, - SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); - if (!bCreatedSymlink) { - DeleteFileA(tempFilePath); - return 1; - } - - DeleteFileA(tempFilePath); - DeleteFileA(symlinkFilePath); - return 0; - } - """, - ), - ), -] - -# Add features representing the build host platform name. -# The build host could differ from the target platform for cross-compilation. -DEFAULT_FEATURES += [ - Feature(name="buildhost={}".format(sys.platform.lower().strip())), - # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. - # We define a consolidated feature on a few platforms. - Feature( - name="buildhost=windows", - when=lambda cfg: platform.system().lower().startswith("windows"), - ), - Feature( - name="buildhost=freebsd", - when=lambda cfg: platform.system().lower().startswith("freebsd"), - ), - Feature( - name="buildhost=aix", - when=lambda cfg: platform.system().lower().startswith("aix"), - ), -] - -# Detect whether GDB is on the system, has Python scripting and supports -# adding breakpoint commands. If so add a substitution to access it. -def check_gdb(cfg): - gdb_path = shutil.which("gdb") - if gdb_path is None: - return False - - # Check that we can set breakpoint commands, which was added in 8.3. - # Using the quit command here means that gdb itself exits, not just - # the "python <...>" command. 
- test_src = """\ -try: - gdb.Breakpoint(\"main\").commands=\"foo\" -except AttributeError: - gdb.execute(\"quit 1\") -gdb.execute(\"quit\")""" - - try: - stdout = subprocess.check_output( - [gdb_path, "-ex", "python " + test_src, "--batch"], - stderr=subprocess.DEVNULL, - universal_newlines=True, - ) - except subprocess.CalledProcessError: - # We can't set breakpoint commands - return False - - # Check we actually ran the Python - return not "Python scripting is not supported" in stdout - - -DEFAULT_FEATURES += [ - Feature( - name="host-has-gdb-with-python", - when=check_gdb, - actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], - ) -] - -# Helpers to define correspondances between LLVM versions and vendor system versions. -# Those are used for backdeployment features below, do not use directly in tests. -DEFAULT_FEATURES += [ - Feature( - name="_target-has-llvm-22", - when=lambda cfg: BooleanExpression.evaluate( - "TBD", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-21", - when=lambda cfg: BooleanExpression.evaluate( - "TBD", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-20", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-21 || target={{.+}}-apple-macosx{{26.[0-9](.\d+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-19", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-20 || target={{.+}}-apple-macosx{{15.[4-9](.\d+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-18", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-19 || target={{.+}}-apple-macosx{{15.[0-3](.\d+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-17", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.\d+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-16", - when=lambda cfg: 
BooleanExpression.evaluate( - "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-15", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-14", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-15", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-13", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", - cfg.available_features, - ), - ), - Feature( - name="_target-has-llvm-12", - when=lambda cfg: BooleanExpression.evaluate( - "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", - cfg.available_features, - ), - ), -] - -# Define features for back-deployment testing. -# -# These features can be used to XFAIL tests that fail when deployed on (or compiled -# for) an older system. For example, if a test exhibits a bug in the libc++ on a -# particular system version, or if it uses a symbol that is not available on an -# older version of the dylib, it can be marked as XFAIL with these features. -# -# We have two families of Lit features: -# -# The first one is `using-built-library-before-llvm-XYZ`. These features encode the -# fact that the test suite is being *run* against a version of the shared/static library -# that predates LLVM version XYZ. This is useful to represent the use case of compiling -# a program against the latest libc++ but then deploying it and running it on an older -# system with an older version of the (usually shared) library. -# -# This feature is built up using the target triple passed to the compiler and the -# `stdlib=system` Lit feature, which encodes that we're running against the same library -# as described by the target triple. 
-# -# The second set of features is `availability--missing`. This family of Lit -# features encodes the presence of availability markup in the libc++ headers. This is -# useful to check that a test fails specifically when compiled for a given deployment -# target, such as when testing availability markup where we want to make sure that -# using the annotated facility on a deployment target that doesn't support it will fail -# at compile time. This can be achieved by creating a `.verify.cpp` test that checks for -# the right errors and marking the test as `REQUIRES: availability--missing`. -# -# This feature is built up using the presence of availability markup detected inside -# __config, the flavor of the library being tested and the target triple passed to the -# compiler. -# -# Note that both families of Lit features are similar but different in important ways. -# For example, tests for availability markup should be expected to produce diagnostics -# regardless of whether we're running against a system library, as long as we're using -# a libc++ flavor that enables availability markup. Similarly, a test could fail when -# run against the system library of an older version of FreeBSD, even though FreeBSD -# doesn't provide availability markup at the time of writing this. 
-for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22"): - DEFAULT_FEATURES.append( - Feature( - name="using-built-library-before-llvm-{}".format(version), - when=lambda cfg, v=version: BooleanExpression.evaluate( - "stdlib=system && !_target-has-llvm-{}".format(v), - cfg.available_features, - ), - ) - ) - -DEFAULT_FEATURES += [ - # Tests that require https://wg21.link/P0482 support in the built library - Feature( - name="availability-char8_t_support-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)", - cfg.available_features, - ), - ), - # Tests that require std::to_chars(floating-point) in the built library - Feature( - name="availability-fp_to_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)", - cfg.available_features, - ), - ), - # Tests that require __libcpp_verbose_abort support in the built library - Feature( - name="availability-verbose_abort-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)", - cfg.available_features, - ), - ), - # Tests that require std::pmr support in the built library - Feature( - name="availability-pmr-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)", - cfg.available_features, - ), - ), - # Tests that require support for and std::print in in the built library. 
- Feature( - name="availability-print-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", - cfg.available_features, - ), - ), - # Tests that require time zone database support in the built library - Feature( - name="availability-tzdb-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", - cfg.available_features, - ), - ), - # Tests that require std::from_chars(floating-point) in the built library - Feature( - name="availability-fp_from_chars-missing", - when=lambda cfg: BooleanExpression.evaluate( - "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", - cfg.available_features, - ), - ), -] diff --git a/libcxx/utils/libcxx/test/features/__init__.py b/libcxx/utils/libcxx/test/features/__init__.py new file mode 100644 index 0000000000000..5c0d1f3aaafc6 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/__init__.py @@ -0,0 +1,21 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from . import availability, compiler, gdb, libcxx_macros, localization, misc, platform + +# Lit features are evaluated in order. Some features depend on other features, so +# we are careful to define them in the correct order. For example, several features +# require the compiler detection to have been performed. 
+DEFAULT_FEATURES = [] +DEFAULT_FEATURES += compiler.features +DEFAULT_FEATURES += libcxx_macros.features +DEFAULT_FEATURES += platform.features +DEFAULT_FEATURES += localization.features +DEFAULT_FEATURES += gdb.features +DEFAULT_FEATURES += misc.features +DEFAULT_FEATURES += availability.features diff --git a/libcxx/utils/libcxx/test/features/availability.py b/libcxx/utils/libcxx/test/features/availability.py new file mode 100644 index 0000000000000..c312a7cf830ed --- /dev/null +++ b/libcxx/utils/libcxx/test/features/availability.py @@ -0,0 +1,199 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature +from lit.BooleanExpression import BooleanExpression + +# Helpers to define correspondances between LLVM versions and vendor system versions. +# Those are used for backdeployment features below, do not use directly in tests. 
+features = [ + Feature( + name="_target-has-llvm-22", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-21", + when=lambda cfg: BooleanExpression.evaluate( + "TBD", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-20", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-21 || target={{.+}}-apple-macosx{{26.[0-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-19", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-20 || target={{.+}}-apple-macosx{{15.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-18", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-19 || target={{.+}}-apple-macosx{{15.[0-3](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-17", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-18 || target={{.+}}-apple-macosx{{14.[4-9](.\d+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-16", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-17 || target={{.+}}-apple-macosx{{14.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-15", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-16 || target={{.+}}-apple-macosx{{13.[4-9](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-14", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-15", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-13", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-14 || target={{.+}}-apple-macosx{{13.[0-3](.[0-9]+)?}}", + cfg.available_features, + ), + ), + Feature( + name="_target-has-llvm-12", + when=lambda cfg: BooleanExpression.evaluate( + "_target-has-llvm-13 || target={{.+}}-apple-macosx{{12.[3-9](.[0-9]+)?}}", + 
cfg.available_features,
+ ),
+ ),
+]
+
+# Define features for back-deployment testing.
+#
+# These features can be used to XFAIL tests that fail when deployed on (or compiled
+# for) an older system. For example, if a test exhibits a bug in the libc++ on a
+# particular system version, or if it uses a symbol that is not available on an
+# older version of the dylib, it can be marked as XFAIL with these features.
+#
+# We have two families of Lit features:
+#
+# The first one is `using-built-library-before-llvm-XYZ`. These features encode the
+# fact that the test suite is being *run* against a version of the shared/static library
+# that predates LLVM version XYZ. This is useful to represent the use case of compiling
+# a program against the latest libc++ but then deploying it and running it on an older
+# system with an older version of the (usually shared) library.
+#
+# This feature is built up using the target triple passed to the compiler and the
+# `stdlib=system` Lit feature, which encodes that we're running against the same library
+# as described by the target triple.
+#
+# The second set of features is `availability-<feature>-missing`. This family of Lit
+# features encodes the presence of availability markup in the libc++ headers. This is
+# useful to check that a test fails specifically when compiled for a given deployment
+# target, such as when testing availability markup where we want to make sure that
+# using the annotated facility on a deployment target that doesn't support it will fail
+# at compile time. This can be achieved by creating a `.verify.cpp` test that checks for
+# the right errors and marking the test as `REQUIRES: availability-<feature>-missing`.
+#
+# This feature is built up using the presence of availability markup detected inside
+# __config, the flavor of the library being tested and the target triple passed to the
+# compiler.
+#
+# Note that both families of Lit features are similar but different in important ways.
+# For example, tests for availability markup should be expected to produce diagnostics
+# regardless of whether we're running against a system library, as long as we're using
+# a libc++ flavor that enables availability markup. Similarly, a test could fail when
+# run against the system library of an older version of FreeBSD, even though FreeBSD
+# doesn't provide availability markup at the time of writing this.
+for version in ("12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22"):
+ features.append(
+ Feature(
+ name="using-built-library-before-llvm-{}".format(version),
+ when=lambda cfg, v=version: BooleanExpression.evaluate(
+ "stdlib=system && !_target-has-llvm-{}".format(v),
+ cfg.available_features,
+ ),
+ )
+ )
+
+features += [
+ # Tests that require https://wg21.link/P0482 support in the built library
+ Feature(
+ name="availability-char8_t_support-missing",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-12)",
+ cfg.available_features,
+ ),
+ ),
+ # Tests that require std::to_chars(floating-point) in the built library
+ Feature(
+ name="availability-fp_to_chars-missing",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-14)",
+ cfg.available_features,
+ ),
+ ),
+ # Tests that require __libcpp_verbose_abort support in the built library
+ Feature(
+ name="availability-verbose_abort-missing",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-15)",
+ cfg.available_features,
+ ),
+ ),
+ # Tests that require std::pmr support in the built library
+ Feature(
+ name="availability-pmr-missing",
+ when=lambda cfg: BooleanExpression.evaluate(
+ "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-16)",
+ cfg.available_features,
+ ),
+ ),
+ # Tests that require support for <print> and std::print in <ostream>
in the built library. + Feature( + name="availability-print-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-18)", + cfg.available_features, + ), + ), + # Tests that require time zone database support in the built library + Feature( + name="availability-tzdb-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-19)", + cfg.available_features, + ), + ), + # Tests that require std::from_chars(floating-point) in the built library + Feature( + name="availability-fp_from_chars-missing", + when=lambda cfg: BooleanExpression.evaluate( + "!libcpp-has-no-availability-markup && (stdlib=apple-libc++ && !_target-has-llvm-20)", + cfg.available_features, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/compiler.py b/libcxx/utils/libcxx/test/features/compiler.py new file mode 100644 index 0000000000000..2fb2d4b1502ad --- /dev/null +++ b/libcxx/utils/libcxx/test/features/compiler.py @@ -0,0 +1,82 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, AddCompileFlag, AddFeature + +_isAnyClang = lambda cfg: "__clang__" in compilerMacros(cfg) +_isAppleClang = lambda cfg: "__apple_build_version__" in compilerMacros(cfg) +_isAnyGCC = lambda cfg: "__GNUC__" in compilerMacros(cfg) +_isClang = lambda cfg: _isAnyClang(cfg) and not _isAppleClang(cfg) +_isGCC = lambda cfg: _isAnyGCC(cfg) and not _isAnyClang(cfg) +_isAnyClangOrGCC = lambda cfg: _isAnyClang(cfg) or _isAnyGCC(cfg) +_isClExe = lambda cfg: not _isAnyClangOrGCC(cfg) +_isMSVC = lambda cfg: "_MSC_VER" in compilerMacros(cfg) +_msvcVersion = lambda cfg: (int(compilerMacros(cfg)["_MSC_VER"]) // 100, int(compilerMacros(cfg)["_MSC_VER"]) % 100) + +features = [ + # gcc-style-warnings detects compilers that understand -Wno-meow flags, unlike MSVC's compiler driver cl.exe. + Feature(name="gcc-style-warnings", when=_isAnyClangOrGCC), + Feature(name="cl-style-warnings", when=_isClExe), + + Feature(name="apple-clang", when=_isAppleClang), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature( + name=lambda cfg: "apple-clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isAppleClang, + ), + Feature(name="clang", when=_isClang), + Feature( + name=lambda cfg: "clang-{__clang_major__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + Feature( + name=lambda cfg: "clang-{__clang_major__}.{__clang_minor__}.{__clang_patchlevel__}".format(**compilerMacros(cfg)), + when=_isClang, + ), + # Note: 
Due to a GCC bug (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104760), we must disable deprecation warnings + # on GCC or spurious diagnostics are issued. + # + # TODO: + # - Enable -Wplacement-new with GCC. + # - Enable -Wclass-memaccess with GCC. + Feature( + name="gcc", + when=_isGCC, + actions=[ + AddCompileFlag("-D_LIBCPP_DISABLE_DEPRECATION_WARNINGS"), + AddCompileFlag("-Wno-placement-new"), + AddCompileFlag("-Wno-class-memaccess"), + AddFeature("GCC-ALWAYS_INLINE-FIXME"), + ], + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}".format(**compilerMacros(cfg)), when=_isGCC + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature( + name=lambda cfg: "gcc-{__GNUC__}.{__GNUC_MINOR__}.{__GNUC_PATCHLEVEL__}".format(**compilerMacros(cfg)), + when=_isGCC, + ), + Feature(name="msvc", when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}".format(*_msvcVersion(cfg)), when=_isMSVC), + Feature(name=lambda cfg: "msvc-{}.{}".format(*_msvcVersion(cfg)), when=_isMSVC), +] diff --git a/libcxx/utils/libcxx/test/features/gdb.py b/libcxx/utils/libcxx/test/features/gdb.py new file mode 100644 index 0000000000000..459a59afc32f4 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/gdb.py @@ -0,0 +1,50 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, AddSubstitution +import shutil +import subprocess + +# Detect whether GDB is on the system, has Python scripting and supports +# adding breakpoint commands. If so add a substitution to access it. 
+def check_gdb(cfg): + gdb_path = shutil.which("gdb") + if gdb_path is None: + return False + + # Check that we can set breakpoint commands, which was added in 8.3. + # Using the quit command here means that gdb itself exits, not just + # the "python <...>" command. + test_src = """\ +try: + gdb.Breakpoint(\"main\").commands=\"foo\" +except AttributeError: + gdb.execute(\"quit 1\") +gdb.execute(\"quit\")""" + + try: + stdout = subprocess.check_output( + [gdb_path, "-ex", "python " + test_src, "--batch"], + stderr=subprocess.DEVNULL, + universal_newlines=True, + ) + except subprocess.CalledProcessError: + # We can't set breakpoint commands + return False + + # Check we actually ran the Python + return not "Python scripting is not supported" in stdout + + +features = [ + Feature( + name="host-has-gdb-with-python", + when=check_gdb, + actions=[AddSubstitution("%{gdb}", lambda cfg: shutil.which("gdb"))], + ) +] diff --git a/libcxx/utils/libcxx/test/features/libcxx_macros.py b/libcxx/utils/libcxx/test/features/libcxx_macros.py new file mode 100644 index 0000000000000..7a465f2e87866 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/libcxx_macros.py @@ -0,0 +1,76 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import Feature, compilerMacros + +features = [] + +# Deduce and add the test features that that are implied by the #defines in +# the <__config> header. +# +# For each macro of the form `_LIBCPP_XXX_YYY_ZZZ` defined below that +# is defined after including <__config>, add a Lit feature called +# `libcpp-xxx-yyy-zzz`. When a macro is defined to a specific value +# (e.g. 
`_LIBCPP_ABI_VERSION=2`), the feature is `libcpp-xxx-yyy-zzz=`. +# +# Note that features that are more strongly tied to libc++ are named libcpp-foo, +# while features that are more general in nature are not prefixed with 'libcpp-'. +macros = { + "_LIBCPP_NO_VCRUNTIME": "libcpp-no-vcruntime", + "_LIBCPP_ABI_VERSION": "libcpp-abi-version", + "_LIBCPP_ABI_BOUNDED_ITERATORS": "libcpp-has-abi-bounded-iterators", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STRING": "libcpp-has-abi-bounded-iterators-in-string", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR": "libcpp-has-abi-bounded-iterators-in-vector", + "_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY": "libcpp-has-abi-bounded-iterators-in-std-array", + "_LIBCPP_ABI_BOUNDED_UNIQUE_PTR": "libcpp-has-abi-bounded-unique_ptr", + "_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE": "libcpp-has-abi-fix-unordered-container-size-type", + "_LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR": "libcpp-deprecated-abi-disable-pair-trivial-copy-ctor", + "_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING": "libcpp-abi-no-compressed-pair-padding", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", +} +for macro, feature in macros.items(): + features.append( + Feature( + name=lambda cfg, m=macro, f=feature: f + ("={}".format(compilerMacros(cfg)[m]) if compilerMacros(cfg)[m] else ""), + when=lambda cfg, m=macro: m in compilerMacros(cfg), + ) + ) + +true_false_macros = { + "_LIBCPP_HAS_THREAD_API_EXTERNAL": "libcpp-has-thread-api-external", + "_LIBCPP_HAS_THREAD_API_PTHREAD": "libcpp-has-thread-api-pthread", +} +for macro, feature in true_false_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "1", + ) + ) + +inverted_macros = { + "_LIBCPP_HAS_TIME_ZONE_DATABASE": "no-tzdb", + "_LIBCPP_HAS_FILESYSTEM": "no-filesystem", + "_LIBCPP_HAS_LOCALIZATION": "no-localization", + "_LIBCPP_HAS_THREADS": "no-threads", + "_LIBCPP_HAS_MONOTONIC_CLOCK": 
"no-monotonic-clock", + "_LIBCPP_HAS_WIDE_CHARACTERS": "no-wide-characters", + "_LIBCPP_HAS_VENDOR_AVAILABILITY_ANNOTATIONS": "libcpp-has-no-availability-markup", + "_LIBCPP_HAS_RANDOM_DEVICE": "no-random-device", + "_LIBCPP_HAS_UNICODE": "libcpp-has-no-unicode", + "_LIBCPP_HAS_TERMINAL": "no-terminal", +} +for macro, feature in inverted_macros.items(): + features.append( + Feature( + name=feature, + when=lambda cfg, m=macro: m in compilerMacros(cfg) + and compilerMacros(cfg)[m] == "0", + ) + ) diff --git a/libcxx/utils/libcxx/test/features/localization.py b/libcxx/utils/libcxx/test/features/localization.py new file mode 100644 index 0000000000000..157c250429d27 --- /dev/null +++ b/libcxx/utils/libcxx/test/features/localization.py @@ -0,0 +1,142 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, Feature, programSucceeds, hasAnyLocale, programOutput, AddSubstitution +import re + +features = [ + # Check for Glibc < 2.27, where the ru_RU.UTF-8 locale had + # mon_decimal_point == ".", which our tests don't handle. + Feature( + name="glibc-old-ru_RU-decimal-point", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and not programSucceeds( + cfg, + """ + #include + #include + int main(int, char**) { + setlocale(LC_ALL, "ru_RU.UTF-8"); + return strcmp(localeconv()->mon_decimal_point, ","); + } + """, + ), + ), +] + +# Mapping from canonical locale names (used in the tests) to possible locale +# names on various systems. Each locale is considered supported if any of the +# alternative names is supported. 
+_locales = { + "en_US.UTF-8": ["en_US.UTF-8", "en_US.utf8", "English_United States.1252"], + "fr_FR.UTF-8": ["fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"], + "ja_JP.UTF-8": ["ja_JP.UTF-8", "ja_JP.utf8", "Japanese_Japan.923"], + "ru_RU.UTF-8": ["ru_RU.UTF-8", "ru_RU.utf8", "Russian_Russia.1251"], + "zh_CN.UTF-8": ["zh_CN.UTF-8", "zh_CN.utf8", "Chinese_China.936"], + "fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"], + "cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"], +} +_provide_locale_conversions = { + "fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"], + "ru_RU.UTF-8": ["mon_thousands_sep"], +} +for locale, alts in _locales.items(): + # Note: Using alts directly in the lambda body here will bind it to the value at the + # end of the loop. Assigning it to a default argument works around this issue. + features.append( + Feature( + name="locale.{}".format(locale), + when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts), + actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction( + cfg, locale, alts, _provide_locale_conversions[locale] + ) + if locale in _provide_locale_conversions + and ("_LIBCPP_HAS_WIDE_CHARACTERS" not in compilerMacros(cfg) or + compilerMacros(cfg)["_LIBCPP_HAS_WIDE_CHARACTERS"] == "1") + else [], + ), + ) + +# Provide environment locale conversions through substitutions to avoid platform specific +# maintenance. +def _getLocaleFlagsAction(cfg, locale, alts, members): + alts_list = ",".join([f'"{l}"' for l in alts]) + get_member_list = ",".join([f"lc->{m}" for m in members]) + + localeconv_info = programOutput( + cfg, + r""" + #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) + #define _CRT_SECURE_NO_WARNINGS + #endif + #include + #include + #include + #include + + // Print each requested locale conversion member on separate lines. 
+ int main(int, char**) { + const char* locales[] = { %s }; + for (int loc_i = 0; loc_i < %d; ++loc_i) { + if (!setlocale(LC_ALL, locales[loc_i])) { + continue; // Choose first locale name that is recognized. + } + + lconv* lc = localeconv(); + const char* members[] = { %s }; + for (size_t m_i = 0; m_i < %d; ++m_i) { + if (!members[m_i]) { + printf("\n"); // member value is an empty string + continue; + } + + size_t len = mbstowcs(nullptr, members[m_i], 0); + if (len == static_cast(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + return 1; + } + // Include room for null terminator. Use malloc as these features + // are also used by lit configs that don't use -lc++ (libunwind tests). + wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t)); + size_t ret = mbstowcs(dst, members[m_i], len + 1); + if (ret == static_cast(-1)) { + fprintf(stderr, "mbstowcs failed unexpectedly\n"); + free(dst); + return 1; + } + + for (size_t i = 0; i < len; ++i) { + if (dst[i] > 0x7F) { + printf("\\u%%04x", dst[i]); + } else { + // c++03 does not allow basic ascii-range characters in UCNs + printf("%%c", (char)dst[i]); + } + } + printf("\n"); + free(dst); + } + return 0; + } + + return 1; + } + """ + % (alts_list, len(alts), get_member_list, len(members)), + ) + valid_define_name = re.sub(r"[.-]", "_", locale).upper() + return [ + # Provide locale conversion through a substitution. 
+ # Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f" + AddSubstitution( + f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}", + lambda cfg, value=value: f"'L\"{value}\"'", + ) + for member, value in zip(members, localeconv_info.split("\n")) + ] diff --git a/libcxx/utils/libcxx/test/features/misc.py b/libcxx/utils/libcxx/test/features/misc.py new file mode 100644 index 0000000000000..738e3d8bb207c --- /dev/null +++ b/libcxx/utils/libcxx/test/features/misc.py @@ -0,0 +1,299 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import compilerMacros, sourceBuilds, hasCompileFlag, programSucceeds, runScriptExitCode +from libcxx.test.dsl import Feature, AddCompileFlag, AddLinkFlag +import platform +import sys + +def _mingwSupportsModules(cfg): + # Only mingw headers are known to work with libc++ built as a module, + # at the moment. + if not "__MINGW32__" in compilerMacros(cfg): + return False + # For mingw headers, check for a version known to support being built + # as a module. + return sourceBuilds( + cfg, + """ + #include <_mingw_mac.h> + #if __MINGW64_VERSION_MAJOR < 12 + #error Headers known to be incompatible + #elif __MINGW64_VERSION_MAJOR == 12 + // The headers were fixed to work with libc++ modules during + // __MINGW64_VERSION_MAJOR == 12. The headers became compatible + // with libc++ built as a module in + // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the + // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed + // removed the now unused __mingw_static_ovr define. Use this + // as indicator for whether we've got new enough headers. 
+ #ifdef __mingw_static_ovr + #error Headers too old + #endif + #else + // __MINGW64_VERSION_MAJOR > 12 should be ok. + #endif + int main(int, char**) { return 0; } + """, + ) + +features = [ + Feature( + name="diagnose-if-support", + when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"), + actions=[AddCompileFlag("-Wuser-defined-warnings")], + ), + Feature( + name="character-conversion-warnings", + when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"), + ), + # Tests to validate whether the compiler has a way to set the maximum number + # of steps during constant evaluation. Since the flag differs per compiler + # store the "valid" flag as a feature. This allows passing the proper compile + # flag to the compiler: + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=12345678 + # // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=12345678 + Feature( + name="has-fconstexpr-steps", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-steps=1"), + ), + Feature( + name="has-fconstexpr-ops-limit", + when=lambda cfg: hasCompileFlag(cfg, "-fconstexpr-ops-limit=1"), + ), + Feature(name="has-fblocks", when=lambda cfg: hasCompileFlag(cfg, "-fblocks")), + Feature( + name="fdelayed-template-parsing", + when=lambda cfg: hasCompileFlag(cfg, "-fdelayed-template-parsing"), + ), + Feature( + name="has-fobjc-arc", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc") + and sys.platform.lower().strip() == "darwin", + ), # TODO: this doesn't handle cross-compiling to Apple platforms. 
+ Feature( + name="objective-c++", + when=lambda cfg: hasCompileFlag(cfg, "-xobjective-c++ -fobjc-arc"), + ), + Feature( + name="verify-support", + when=lambda cfg: hasCompileFlag(cfg, "-Xclang -verify-ignore-unexpected"), + ), + Feature( + name="add-latomic-workaround", # https://llvm.org/PR73361 + when=lambda cfg: sourceBuilds( + cfg, "int main(int, char**) { return 0; }", ["-latomic"] + ), + actions=[AddLinkFlag("-latomic")], + ), + Feature( + name="has-64-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include + struct Large { char storage[64/8]; }; + std::atomic x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + Feature( + name="has-1024-bit-atomics", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include + struct Large { char storage[1024/8]; }; + std::atomic x; + int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; } + """, + ), + ), + # Tests that require 64-bit architecture + Feature( + name="32-bit-pointer", + when=lambda cfg: sourceBuilds( + cfg, + """ + int main(int, char**) { + static_assert(sizeof(void *) == 4); + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.20348.0): + # https://developercommunity.visualstudio.com/t/utf-8-locales-break-ctype-functions-for-wchar-type/1653678 + Feature( + name="win32-broken-utf8-wchar-ctype", + when=lambda cfg: not "_LIBCPP_HAS_LOCALIZATION" in compilerMacros(cfg) + or compilerMacros(cfg)["_LIBCPP_HAS_LOCALIZATION"] == "1" + and "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include + #include + int main(int, char**) { + setlocale(LC_ALL, "en_US.UTF-8"); + return towlower(L'\\xDA') != L'\\xFA'; + } + """, + ), + ), + # Check for a Windows UCRT bug (fixed in UCRT/Windows 10.0.19041.0). 
+ # https://developercommunity.visualstudio.com/t/printf-formatting-with-g-outputs-too/1660837 + Feature( + name="win32-broken-printf-g-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include + #include + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%#.*g", 0, 0.0); + return strcmp(buf, "0."); + } + """, + ), + ), + # Check for a Windows UCRT bug (not fixed upstream yet). + # With UCRT, printf("%a", 0.0) produces "0x0.0000000000000p+0", + # while other C runtimes produce just "0x0p+0". + # https://developercommunity.visualstudio.com/t/Printf-formatting-of-float-as-hex-prints/1660844 + Feature( + name="win32-broken-printf-a-precision", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and not programSucceeds( + cfg, + """ + #include + #include + int main(int, char**) { + char buf[100]; + snprintf(buf, sizeof(buf), "%a", 0.0); + return strcmp(buf, "0x0p+0"); + } + """, + ), + ), + Feature( + name="has-unix-headers", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include + #include + int main(int, char**) { + int fd[2]; + return pipe(fd); + } + """, + ), + ), + # Whether Bash can run on the executor. + # This is not always the case, for example when running on embedded systems. + # + # For the corner case of bash existing, but it being missing in the path + # set in %{exec} as "--env PATH=one-single-dir", the executor does find + # and executes bash, but bash then can't find any other common shell + # utilities. Test executing "bash -c 'bash --version'" to see if bash + # manages to find binaries to execute. + Feature( + name="executor-has-no-bash", + when=lambda cfg: runScriptExitCode(cfg, ["%{exec} bash -c 'bash --version'"]) != 0, + ), + # Whether module support for the platform is available. + Feature( + name="has-no-cxx-module-support", + # The libc of these platforms have functions with internal linkage. 
+ # This is not allowed per C11 7.1.2 Standard headers/6 + # Any declaration of a library function shall have external linkage. + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) + or "__FreeBSD__" in compilerMacros(cfg) + or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg)) + or platform.system().lower().startswith("aix") + # Avoid building on platforms that don't support modules properly. + or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier") + # older versions don't support extern "C++", newer versions don't support main in named module. + or not ( + sourceBuilds( + cfg, + """ + export module test; + extern "C++" int main(int, char**) { return 0; } + """, + ) + or sourceBuilds( + cfg, + """ + export module test; + int main(int, char**) { return 0; } + """, + ) + ), + ), + # The time zone validation tests compare the output of zdump against the + # output generated by 's time zone support. + Feature( + name="has-no-zdump", + when=lambda cfg: runScriptExitCode(cfg, ["zdump --version"]) != 0, + ), + Feature( + name="can-create-symlinks", + when=lambda cfg: "_WIN32" not in compilerMacros(cfg) + or programSucceeds( + cfg, + # Creation of symlinks require elevated privileges on Windows unless + # Windows developer mode is enabled. + """ + #include + #include + int main(int, char**) { + CHAR tempDirPath[MAX_PATH]; + DWORD tempPathRet = GetTempPathA(MAX_PATH, tempDirPath); + if (tempPathRet == 0 || tempPathRet > MAX_PATH) { + return 1; + } + + CHAR tempFilePath[MAX_PATH]; + UINT uRetVal = GetTempFileNameA( + tempDirPath, + "cxx", // Prefix + 0, // Unique=0 also implies file creation. + tempFilePath); + if (uRetVal == 0) { + return 1; + } + + CHAR symlinkFilePath[MAX_PATH]; + int ret = sprintf_s(symlinkFilePath, MAX_PATH, "%s_symlink", tempFilePath); + if (ret == -1) { + DeleteFileA(tempFilePath); + return 1; + } + + // Requires either administrator, or developer mode enabled. 
+ BOOL bCreatedSymlink = CreateSymbolicLinkA(symlinkFilePath, + tempFilePath, + SYMBOLIC_LINK_FLAG_ALLOW_UNPRIVILEGED_CREATE); + if (!bCreatedSymlink) { + DeleteFileA(tempFilePath); + return 1; + } + + DeleteFileA(tempFilePath); + DeleteFileA(symlinkFilePath); + return 0; + } + """, + ), + ), +] diff --git a/libcxx/utils/libcxx/test/features/platform.py b/libcxx/utils/libcxx/test/features/platform.py new file mode 100644 index 0000000000000..db9d3931da7ff --- /dev/null +++ b/libcxx/utils/libcxx/test/features/platform.py @@ -0,0 +1,132 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +from libcxx.test.dsl import programOutput, Feature, compilerMacros, programSucceeds, AddCompileFlag, sourceBuilds +import platform +import sys + +def _getAndroidDeviceApi(cfg): + return int( + programOutput( + cfg, + r""" + #include + #include + int main(int, char**) { + printf("%d\n", android_get_device_api_level()); + return 0; + } + """, + ) + ) + +# Add features representing the target platform name: darwin, linux, windows, etc... +features = [ + Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)), + Feature(name="windows", when=lambda cfg: "_WIN32" in compilerMacros(cfg)), + Feature( + name="windows-dll", + when=lambda cfg: "_WIN32" in compilerMacros(cfg) + and sourceBuilds( + cfg, + """ + #include + int main(int, char**) { return 0; } + """, + ) + and programSucceeds( + cfg, + """ + #include + #include + #include + int main(int, char**) { + // Get a pointer to a data member that gets linked from the C++ + // library. 
This must be a data member (functions can get + // thunk inside the calling executable), and must not be + // something that is defined inline in headers. + void *ptr = &std::cout; + // Get a handle to the current main executable. + void *exe = GetModuleHandle(NULL); + // The handle points at the PE image header. Navigate through + // the header structure to find the size of the PE image (the + // executable). + PIMAGE_DOS_HEADER dosheader = (PIMAGE_DOS_HEADER)exe; + PIMAGE_NT_HEADERS ntheader = (PIMAGE_NT_HEADERS)((BYTE *)dosheader + dosheader->e_lfanew); + PIMAGE_OPTIONAL_HEADER peheader = &ntheader->OptionalHeader; + void *exeend = (BYTE*)exe + peheader->SizeOfImage; + // Check if the tested pointer - the data symbol from the + // C++ library - is located within the exe. + if (ptr >= exe && ptr <= exeend) + return 1; + // Return success if it was outside of the executable, i.e. + // loaded from a DLL. + return 0; + } + """, + ), + actions=[AddCompileFlag("-DTEST_WINDOWS_DLL")], + ), + Feature(name="linux", when=lambda cfg: "__linux__" in compilerMacros(cfg)), + Feature(name="android", when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)), + Feature( + name=lambda cfg: "android-device-api={}".format(_getAndroidDeviceApi(cfg)), + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-ANDROID-FIXME", + when=lambda cfg: "__ANDROID__" in compilerMacros(cfg), + ), + Feature(name="netbsd", when=lambda cfg: "__NetBSD__" in compilerMacros(cfg)), + Feature(name="freebsd", when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg)), + Feature( + name="LIBCXX-FREEBSD-FIXME", + when=lambda cfg: "__FreeBSD__" in compilerMacros(cfg), + ), + Feature( + name="LIBCXX-PICOLIBC-FIXME", + when=lambda cfg: sourceBuilds( + cfg, + """ + #include + #ifndef __PICOLIBC__ + #error not picolibc + #endif + int main(int, char**) { return 0; } + """, + ), + ), + Feature( + name="LIBCXX-AMDGPU-FIXME", + when=lambda cfg: "__AMDGPU__" in compilerMacros(cfg), + ), + 
Feature( + name="LIBCXX-NVPTX-FIXME", + when=lambda cfg: "__NVPTX__" in compilerMacros(cfg), + ), +] + +# Add features representing the build host platform name. +# The build host could differ from the target platform for cross-compilation. +features += [ + Feature(name="buildhost={}".format(sys.platform.lower().strip())), + # sys.platform can often be represented by a "sub-system", such as 'win32', 'cygwin', 'mingw', freebsd13 & etc. + # We define a consolidated feature on a few platforms. + Feature( + name="buildhost=windows", + when=lambda cfg: platform.system().lower().startswith("windows"), + ), + Feature( + name="buildhost=freebsd", + when=lambda cfg: platform.system().lower().startswith("freebsd"), + ), + Feature( + name="buildhost=aix", + when=lambda cfg: platform.system().lower().startswith("aix"), + ), +] diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index c02d6df1c47a4..299aa28777fd5 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -11,7 +11,7 @@ from pathlib import Path from libcxx.test.dsl import * -from libcxx.test.features import _isClang, _isAppleClang, _isGCC, _isMSVC +from libcxx.test.features.compiler import _isClang, _isAppleClang, _isGCC, _isMSVC _warningFlags = [ diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index 5c1b7d4943b3f..2957cb716041d 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -1328,32 +1328,36 @@ class Executor { m_emu, inst, 8, ZextD, [](uint64_t a, uint64_t b) { return std::max(a, b); }); } - template - bool F_Load(T inst, const fltSemantics &(*semantics)(), - unsigned int numBits) { + template + bool F_Load(I inst, const fltSemantics &(*semantics)()) { return transformOptional(inst.rs1.Read(m_emu), [&](auto &&rs1) { - uint64_t addr = rs1 + 
uint64_t(inst.imm); - uint64_t bits = *m_emu.ReadMem(addr); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); + uint64_t bits = *m_emu.ReadMem(addr); + unsigned numBits = sizeof(T) * 8; APFloat f(semantics(), APInt(numBits, bits)); return inst.rd.WriteAPFloat(m_emu, f); }) .value_or(false); } - bool operator()(FLW inst) { return F_Load(inst, &APFloat::IEEEsingle, 32); } - template bool F_Store(T inst, bool isDouble) { + bool operator()(FLW inst) { + return F_Load(inst, &APFloat::IEEEsingle); + } + template bool F_Store(I inst, bool isDouble) { return transformOptional(zipOpt(inst.rs1.Read(m_emu), inst.rs2.ReadAPFloat(m_emu, isDouble)), [&](auto &&tup) { auto [rs1, rs2] = tup; - uint64_t addr = rs1 + uint64_t(inst.imm); + uint64_t addr = + rs1 + uint64_t(SignExt(inst.imm)); uint64_t bits = rs2.bitcastToAPInt().getZExtValue(); - return m_emu.WriteMem(addr, bits); + return m_emu.WriteMem(addr, bits); }) .value_or(false); } - bool operator()(FSW inst) { return F_Store(inst, false); } + bool operator()(FSW inst) { return F_Store(inst, false); } std::tuple FusedMultiplyAdd(APFloat rs1, APFloat rs2, APFloat rs3) { auto opStatus = rs1.fusedMultiplyAdd(rs2, rs3, m_emu.GetRoundingMode()); @@ -1616,8 +1620,10 @@ class Executor { bool operator()(FCVT_S_LU inst) { return FCVT_f2i(inst, &Rs::Read, APFloat::IEEEsingle()); } - bool operator()(FLD inst) { return F_Load(inst, &APFloat::IEEEdouble, 64); } - bool operator()(FSD inst) { return F_Store(inst, true); } + bool operator()(FLD inst) { + return F_Load(inst, &APFloat::IEEEdouble); + } + bool operator()(FSD inst) { return F_Store(inst, true); } bool operator()(FMADD_D inst) { return FMA(inst, true, 1.0f, 1.0f); } bool operator()(FMSUB_D inst) { return FMA(inst, true, 1.0f, -1.0f); } bool operator()(FNMSUB_D inst) { return FMA(inst, true, -1.0f, 1.0f); } diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 57d88f615e2b3..22b9711fda480 100644 
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -15,6 +15,8 @@ #include "lldb/Utility/UriParser.h" #include "lldb/ValueObject/ValueObject.h" +#include "llvm/ADT/DenseMap.h" + #include "AdbClient.h" #include "PlatformAndroid.h" #include "PlatformAndroidRemoteGDBServer.h" @@ -479,136 +481,90 @@ std::string PlatformAndroid::GetRunAs() { return run_as.str(); } -// Helper function to populate process status information from -// /proc/[pid]/status -void PlatformAndroid::PopulateProcessStatusInfo( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/status to get parent PID, UIDs, and GIDs - Status error; - AdbClientUP status_adb = GetAdbClient(error); - if (error.Fail()) - return; - - std::string status_output; - StreamString status_cmd; - status_cmd.Printf( - "cat /proc/%llu/status 2>/dev/null | grep -E '^(PPid|Uid|Gid):'", - static_cast(pid)); - Status status_error = - status_adb->Shell(status_cmd.GetData(), seconds(5), &status_output); +static bool NeedsCmdlineSupplement(const ProcessInstanceInfo &proc_info) { + llvm::StringRef name = + proc_info.GetExecutableFile().GetFilename().GetStringRef(); + return name.contains("app_process") || name.contains("zygote"); +} - if (status_error.Fail() || status_output.empty()) +// Fetch /proc/PID/cmdline for processes to get actual package names. +// Android apps often show as "zygote" or "app_process" without this. 
+static void SupplementWithCmdlineInfo(ProcessInstanceInfoList &proc_infos, + AdbClient *adb) { + if (proc_infos.empty()) return; - llvm::SmallVector lines; - llvm::StringRef(status_output).split(lines, '\n'); - - for (llvm::StringRef line : lines) { - line = line.trim(); - if (line.starts_with("PPid:")) { - llvm::StringRef ppid_str = line.substr(5).trim(); - lldb::pid_t ppid; - if (llvm::to_integer(ppid_str, ppid)) - process_info.SetParentProcessID(ppid); - } else if (line.starts_with("Uid:")) { - llvm::SmallVector uid_parts; - line.substr(4).trim().split(uid_parts, '\t', -1, false); - if (uid_parts.size() >= 2) { - uint32_t uid, euid; - if (llvm::to_integer(uid_parts[0].trim(), uid)) - process_info.SetUserID(uid); - if (llvm::to_integer(uid_parts[1].trim(), euid)) - process_info.SetEffectiveUserID(euid); - } - } else if (line.starts_with("Gid:")) { - llvm::SmallVector gid_parts; - line.substr(4).trim().split(gid_parts, '\t', -1, false); - if (gid_parts.size() >= 2) { - uint32_t gid, egid; - if (llvm::to_integer(gid_parts[0].trim(), gid)) - process_info.SetGroupID(gid); - if (llvm::to_integer(gid_parts[1].trim(), egid)) - process_info.SetEffectiveGroupID(egid); - } + llvm::DenseMap pid_map; + std::string pid_list; + for (auto &proc_info : proc_infos) { + if (NeedsCmdlineSupplement(proc_info)) { + lldb::pid_t pid = proc_info.GetProcessID(); + pid_map[pid] = &proc_info; + if (!pid_list.empty()) + pid_list += " "; + pid_list += std::to_string(pid); } } -} -// Helper function to populate command line arguments from /proc/[pid]/cmdline -void PlatformAndroid::PopulateProcessCommandLine( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/cmdline to get command line arguments - Status error; - AdbClientUP cmdline_adb = GetAdbClient(error); - if (error.Fail()) + if (pid_list.empty()) return; + Log *log = GetLog(LLDBLog::Platform); + + // Use xargs -P to parallelize cmdline fetching (up to 8 concurrent reads) + StreamString cmd; + cmd.Printf( + 
"echo '%s' | xargs -n 1 -P 8 sh -c " + "'echo \"$1:$(cat /proc/$1/cmdline 2>/dev/null | tr \"\\0\" \" \")\"' sh", + pid_list.c_str()); + std::string cmdline_output; - StreamString cmdline_cmd; - cmdline_cmd.Printf("cat /proc/%llu/cmdline 2>/dev/null | tr '\\000' ' '", - static_cast(pid)); - Status cmdline_error = - cmdline_adb->Shell(cmdline_cmd.GetData(), seconds(5), &cmdline_output); + Status error = adb->Shell(cmd.GetData(), seconds(5), &cmdline_output); - if (cmdline_error.Fail() || cmdline_output.empty()) + if (error.Fail() || cmdline_output.empty()) return; - cmdline_output = llvm::StringRef(cmdline_output).trim().str(); - if (cmdline_output.empty()) - return; + llvm::SmallVector lines; + llvm::StringRef(cmdline_output).split(lines, '\n', -1, false); - llvm::SmallVector args; - llvm::StringRef(cmdline_output).split(args, ' ', -1, false); - if (args.empty()) - return; + for (llvm::StringRef line : lines) { + line = line.trim(); + auto [pid_str, cmdline] = line.split(':'); + if (pid_str.empty() || cmdline.empty()) + continue; - process_info.SetArg0(args[0]); - Args process_args; - for (size_t i = 1; i < args.size(); i++) { - if (!args[i].empty()) - process_args.AppendArgument(args[i]); - } - process_info.SetArguments(process_args, false); -} + cmdline = cmdline.trim(); -// Helper function to populate architecture from /proc/[pid]/exe -void PlatformAndroid::PopulateProcessArchitecture( - lldb::pid_t pid, ProcessInstanceInfo &process_info) { - // Read /proc/[pid]/exe to get executable path for architecture detection - Status error; - AdbClientUP exe_adb = GetAdbClient(error); - if (error.Fail()) - return; + lldb::pid_t pid; + if (!llvm::to_integer(pid_str, pid) || cmdline.empty()) + continue; - std::string exe_output; - StreamString exe_cmd; - exe_cmd.Printf("readlink /proc/%llu/exe 2>/dev/null", - static_cast(pid)); - Status exe_error = exe_adb->Shell(exe_cmd.GetData(), seconds(5), &exe_output); + auto it = pid_map.find(pid); + if (it == pid_map.end()) + 
continue; - if (exe_error.Fail() || exe_output.empty()) - return; + ProcessInstanceInfo *proc_info = it->second; + llvm::SmallVector args; + cmdline.split(args, ' ', -1, false); - exe_output = llvm::StringRef(exe_output).trim().str(); - - // Determine architecture from exe path - ArchSpec arch; - if (exe_output.find("64") != std::string::npos || - exe_output.find("arm64") != std::string::npos || - exe_output.find("aarch64") != std::string::npos) { - arch.SetTriple("aarch64-unknown-linux-android"); - } else if (exe_output.find("x86_64") != std::string::npos) { - arch.SetTriple("x86_64-unknown-linux-android"); - } else if (exe_output.find("x86") != std::string::npos || - exe_output.find("i686") != std::string::npos) { - arch.SetTriple("i686-unknown-linux-android"); - } else { - // Default to armv7 for 32-bit ARM (most common on Android) - arch.SetTriple("armv7-unknown-linux-android"); - } + if (!args.empty()) { + proc_info->GetExecutableFile().SetFile(args[0], FileSpec::Style::posix); + + if (args.size() > 1) { + Args process_args; + for (size_t i = 1; i < args.size(); ++i) { + if (!args[i].empty()) + process_args.AppendArgument(args[i]); + } + proc_info->SetArguments(process_args, false); + } - if (arch.IsValid()) - process_info.SetArchitecture(arch); + LLDB_LOGF(log, + "PlatformAndroid::%s supplemented PID %llu with cmdline: %s", + __FUNCTION__, static_cast(pid), + cmdline.str().c_str()); + } + } } uint32_t @@ -616,109 +572,39 @@ PlatformAndroid::FindProcesses(const ProcessInstanceInfoMatch &match_info, ProcessInstanceInfoList &proc_infos) { proc_infos.clear(); - // When LLDB is running natively on an Android device (IsHost() == true), - // use the parent class's standard Linux /proc enumeration. IsHost() is only - // true when compiled for Android (#if defined(__ANDROID__)), so calling - // PlatformLinux methods is safe (Android is Linux-based). 
if (IsHost()) return PlatformLinux::FindProcesses(match_info, proc_infos); - // Remote Android platform: implement process name lookup using 'pidof' over - // adb. - - // LLDB stores the search name in GetExecutableFile() (even though it's - // actually a process name like "com.android.chrome" rather than an - // executable path). If no search name is provided, we can't use - // 'pidof', so return early with no results. - const ProcessInstanceInfo &match_process_info = match_info.GetProcessInfo(); - if (!match_process_info.GetExecutableFile() || - match_info.GetNameMatchType() == NameMatch::Ignore) { - return 0; - } - - // Extract the process name to search for (typically an Android package name - // like "com.example.app" or binary name like "app_process64") - std::string process_name = match_process_info.GetExecutableFile().GetPath(); - if (process_name.empty()) - return 0; - - // Use adb to find the process by name - Status error; - AdbClientUP adb(GetAdbClient(error)); - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s failed to get ADB client: %s", - __FUNCTION__, error.AsCString()); - return 0; - } - - // Use 'pidof' command to get PIDs for the process name. - // Quote the process name to handle special characters (spaces, etc.) - std::string pidof_output; - StreamString command; - command.Printf("pidof '%s'", process_name.c_str()); - error = adb->Shell(command.GetData(), seconds(5), &pidof_output); - - if (error.Fail()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOG(log, "PlatformAndroid::{} 'pidof {}' failed: {}", __FUNCTION__, - process_name.c_str(), error.AsCString()); - return 0; - } - - // Parse PIDs from pidof output. - // Note: pidof can return multiple PIDs (space-separated) if multiple - // instances of the same executable are running. 
- pidof_output = llvm::StringRef(pidof_output).trim().str(); - if (pidof_output.empty()) { - Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "PlatformAndroid::%s no process found with name '%s'", - __FUNCTION__, process_name.c_str()); + if (!m_remote_platform_sp) return 0; - } - - // Split the output by whitespace to handle multiple PIDs - llvm::SmallVector pid_strings; - llvm::StringRef(pidof_output).split(pid_strings, ' ', -1, false); - - Log *log = GetLog(LLDBLog::Platform); - - // Process each PID and gather information - uint32_t num_matches = 0; - for (llvm::StringRef pid_str : pid_strings) { - pid_str = pid_str.trim(); - if (pid_str.empty()) - continue; - - lldb::pid_t pid; - if (!llvm::to_integer(pid_str, pid)) { - LLDB_LOGF(log, "PlatformAndroid::%s failed to parse PID from: '%s'", - __FUNCTION__, pid_str.str().c_str()); - continue; - } - - ProcessInstanceInfo process_info; - process_info.SetProcessID(pid); - process_info.GetExecutableFile().SetFile(process_name, - FileSpec::Style::posix); - - // Populate additional process information - PopulateProcessStatusInfo(pid, process_info); - PopulateProcessCommandLine(pid, process_info); - PopulateProcessArchitecture(pid, process_info); - - // Check if this process matches the criteria - if (match_info.Matches(process_info)) { - proc_infos.push_back(process_info); - num_matches++; - LLDB_LOGF(log, "PlatformAndroid::%s found process '%s' with PID %llu", - __FUNCTION__, process_name.c_str(), - static_cast(pid)); + // Android-specific process name handling: + // Apps spawned from zygote initially appear as "app_process" or "zygote" + // in the process list, but their actual package names (e.g., + // "com.example.app") are only available in /proc/PID/cmdline. To support + // name-based matching, we must first fetch cmdline info for all processes, + // then apply the original name filter. 
+ ProcessInstanceInfoMatch broad_match_info = match_info; + broad_match_info.SetNameMatchType(NameMatch::Ignore); + + ProcessInstanceInfoList all_procs; + uint32_t count = + m_remote_platform_sp->FindProcesses(broad_match_info, all_procs); + + if (count > 0) { + Status error; + AdbClientUP adb(GetAdbClient(error)); + if (error.Success()) + SupplementWithCmdlineInfo(all_procs, adb.get()); + + // Apply the original name matching against supplemented process info. + for (auto &proc_info : all_procs) { + if (match_info.Matches(proc_info)) + proc_infos.push_back(proc_info); } } - return num_matches; + return proc_infos.size(); } std::unique_ptr PlatformAndroid::GetSyncService(Status &error) { diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h index e771c6ae97d4d..c6a412b39d410 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h @@ -60,7 +60,7 @@ class PlatformAndroid : public platform_linux::PlatformLinux { uint32_t GetDefaultMemoryCacheLineSize() override; uint32_t FindProcesses(const ProcessInstanceInfoMatch &match_info, - ProcessInstanceInfoList &proc_infos) override; + ProcessInstanceInfoList &process_infos) override; protected: const char *GetCacheHostname() override; @@ -86,17 +86,8 @@ class PlatformAndroid : public platform_linux::PlatformLinux { protected: virtual std::unique_ptr GetSyncService(Status &error); -private: std::string m_device_id; uint32_t m_sdk_version; - - // Helper functions for process information gathering - void PopulateProcessStatusInfo(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessCommandLine(lldb::pid_t pid, - ProcessInstanceInfo &process_info); - void PopulateProcessArchitecture(lldb::pid_t pid, - ProcessInstanceInfo &process_info); }; } // namespace platform_android diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp 
index b5d82682199db..7cf21431efecd 100644 --- a/llvm/benchmarks/SpecialCaseListBM.cpp +++ b/llvm/benchmarks/SpecialCaseListBM.cpp @@ -5,7 +5,6 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/raw_ostream.h" #include -#include #include #include #include diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h index 9555fadda6e47..c10a38c8ce4cb 100644 --- a/llvm/include/llvm/ADT/BitmaskEnum.h +++ b/llvm/include/llvm/ADT/BitmaskEnum.h @@ -11,7 +11,6 @@ #include #include -#include #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/bit.h" diff --git a/llvm/include/llvm/ADT/ConcurrentHashtable.h b/llvm/include/llvm/ADT/ConcurrentHashtable.h index 0cc03cf7a692a..9ee5f594ea56a 100644 --- a/llvm/include/llvm/ADT/ConcurrentHashtable.h +++ b/llvm/include/llvm/ADT/ConcurrentHashtable.h @@ -24,7 +24,6 @@ #include #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/ADT/PointerSumType.h b/llvm/include/llvm/ADT/PointerSumType.h index c4971bf3af87a..c8e6cffd796a6 100644 --- a/llvm/include/llvm/ADT/PointerSumType.h +++ b/llvm/include/llvm/ADT/PointerSumType.h @@ -15,7 +15,6 @@ #include #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index c129f3a695b9e..0fde14126c79b 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -28,7 +28,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include -#include namespace llvm { diff --git a/llvm/include/llvm/ADT/TinyPtrVector.h b/llvm/include/llvm/ADT/TinyPtrVector.h index 8d7a07b5e9eb5..ed08ec8a966c7 100644 --- a/llvm/include/llvm/ADT/TinyPtrVector.h +++ b/llvm/include/llvm/ADT/TinyPtrVector.h @@ -15,7 +15,6 @@ #include #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/Analysis/DominanceFrontier.h b/llvm/include/llvm/Analysis/DominanceFrontier.h index 68ddcf753b59f..787793501f98a 100644 --- 
a/llvm/include/llvm/Analysis/DominanceFrontier.h +++ b/llvm/include/llvm/Analysis/DominanceFrontier.h @@ -24,7 +24,6 @@ #include "llvm/Pass.h" #include "llvm/Support/GenericDomTree.h" #include -#include namespace llvm { diff --git a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h index 871dd95c365e8..1483588581f4e 100644 --- a/llvm/include/llvm/Analysis/DominanceFrontierImpl.h +++ b/llvm/include/llvm/Analysis/DominanceFrontierImpl.h @@ -24,7 +24,6 @@ #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 98565f423df3e..4d5d1fc7dfadc 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include diff --git a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h index 5e0779157473e..8fde15d342a15 100644 --- a/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h +++ b/llvm/include/llvm/DWARFLinker/IndexedValuesMap.h @@ -12,7 +12,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include -#include namespace llvm { namespace dwarf_linker { diff --git a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h index b769e53d80270..7a1008689296d 100644 --- a/llvm/include/llvm/DebugInfo/CodeView/CodeView.h +++ b/llvm/include/llvm/DebugInfo/CodeView/CodeView.h @@ -15,7 +15,6 @@ #include "llvm/Support/Compiler.h" #include -#include #include "llvm/ADT/STLForwardCompat.h" #include "llvm/Support/Endian.h" diff --git a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h index 76a019ddf8f34..17b5bfac9ac31 100644 --- a/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h +++ 
b/llvm/include/llvm/DebugInfo/PDB/Native/FormatUtil.h @@ -19,7 +19,6 @@ #include "llvm/Support/FormatVariadic.h" #include -#include namespace llvm { namespace pdb { diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h new file mode 100644 index 0000000000000..a996dfd9543df --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_systemz.h @@ -0,0 +1,39 @@ +//===--- ELF_systemz.h - JIT link functions for ELF/systemz --*- C++ -*----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for ELF/systemz. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an ELF/systemz relocatable object +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr SSP); + +/// jit-link the given object buffer, which must be a ELF systemz relocatable +/// object file. 
+void link_ELF_systemz(std::unique_ptr G, + std::unique_ptr Ctx); + +} // end namespace jitlink +} // end namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_ELF_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h new file mode 100644 index 0000000000000..dde3448cd5da7 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/systemz.h @@ -0,0 +1,924 @@ +//=== systemz.h - Generic JITLink systemz edge kinds, utilities -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H +#define LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H + +#include "TableManager.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +using namespace llvm::support::endian; + +namespace llvm { +namespace jitlink { +namespace systemz { + +/// Represents systemz fixups and other systemz-specific edge kinds. +enum EdgeKind_systemz : Edge::Kind { + + /// A plain 64-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint64 + /// + Pointer64 = Edge::FirstRelocation, + + /// A plain 32-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint32 + /// + /// Errors: + /// - The target must reside in the low 32-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer32, + + /// A plain 20-bit pointer value relocation. 
+ /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint20 + /// + /// Errors: + /// - The target must reside in the low 20-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer20, + + /// A plain 16-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint16 + /// + /// Errors: + /// - The target must reside in the low 16-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer16, + + /// A plain 12-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint12 + /// + /// Errors: + /// - The target must reside in the low 12-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer12, + + /// A plain 8-bit pointer value relocation. + /// + /// Fixup expression: + /// Fixup <- Target + Addend : uint8 + /// + /// Errors: + /// - The target must reside in the low 8-bits of the address space, + /// otherwise an out-of-range error will be returned. + /// + Pointer8, + + /// A 64-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + Delta64, + + /// A 32-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32, + + /// A 16-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int16 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16, + + /// A 32-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. 
+ /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dbl, + + /// A 24-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + Delta24dbl, + + /// A 16-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + Delta16dbl, + + /// A 12-bit delta shifted by 1. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + Delta12dbl, + + /// A 64-bit negative delta. 
+ /// + /// Delta from target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int64 + /// + NegDelta64, + + /// A 32-bit negative delta. + /// + /// Delta from the target back to the fixup. + /// + /// Fixup expression: + /// Fixup <- Fixup - Target + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + NegDelta32, + + /// A 32-bit Delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT32dbl, + + /// A 24-bit Delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int24 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int25, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT24dbl, + + /// A 16-bit Delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. 
+ /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int16 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int17, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT16dbl, + + /// A 12-bit Delta shifted by 1. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- (Target - Fixup + Addend) >> 1 : int12 + /// + /// Errors: + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int13, otherwise an out-of-range error will be returned. + /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + DeltaPLT12dbl, + + /// A 64-bit Delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int64 + /// + DeltaPLT64, + + /// A 32-bit Delta. + /// + /// Delta from the fixup to the PLT slot for the target. This will lead to + /// creation of a PLT stub. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + DeltaPLT32, + + /// A 64-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol was not been defined. + /// + Delta64PLTFromGOT, + + /// A 32-bit offset from GOT to PLT. 
+ /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32PLTFromGOT, + + /// A 16-bit offset from GOT to PLT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16PLTFromGOT, + + /// A 64-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int64 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// + Delta64FromGOT, + + /// A 32-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32FromGOT, + + /// A 16-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int16 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16FromGOT, + + /// A 20-bit offset from GOT. 
+ /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int20 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int20, otherwise + /// an out-of-range error will be returned. + /// + Delta20FromGOT, + + /// A 12-bit offset from GOT. + /// + /// Fixup expression: + /// Fixup <- Target - GOTBase + Addend : int12 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol has not been defined. + /// - The result of the fixup expression must fit into an int12, otherwise + /// an out-of-range error will be returned. + /// + Delta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta64FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta64FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + RequestGOTAndTransformToDelta64FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. 
+ /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// + RequestGOTAndTransformToDelta32FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta20FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta20FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// + RequestGOTAndTransformToDelta20FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta16FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta16FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// + RequestGOTAndTransformToDelta16FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta12FromGOT pointing + /// at the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta12FromGOT + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does + /// not already exist. 
+ /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta12FromGOT, + + /// A GOT entry getter/constructor, transformed to Delta32dbl pointing at + /// the GOT entry for the original target. + /// + /// Indicates that this edge should be transformed into a Delta32dbl targeting + /// the GOT entry for the edge's current target, maintaining the same addend. + /// A GOT entry for the target should be created if one does not already + /// exist. + /// + /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToDelta32dbl, + + /// A 32-bit Delta to GOT base. + /// + /// Fixup expression: + /// Fixup <- GOTBase - Fixup + Addend : int32 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol was not been defined. + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + /// + Delta32GOTBase, + + /// A 32-bit Delta to GOT base shifted by 1. + /// + /// Fixup expression: + /// Fixup <- (GOTBase - Fixup + Addend) >> 1 : int32 + /// + /// Errors: + /// - *ASSERTION* Failure to a null pointer GOTSymbol, which the GOT section + /// symbol was not been defined. + /// - The result of the fixup expression before shifting right by 1 must + /// fit into an int33, otherwise an out-of-range error will be returned. 
+ /// - The result of the fixup expression before shifting right by 1 must + /// be multiple of 2, otherwise an alignment error will be returned. + /// + Delta32dblGOTBase, + +}; + +/// Returns a string name for the given systemz edge. For debugging purposes +/// only +const char *getEdgeKindName(Edge::Kind K); + +/// Apply fixup expression for edge to block content. +inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, + const Symbol *GOTSymbol) { + using namespace support; + + char *BlockWorkingMem = B.getAlreadyMutableContent().data(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset(); + int64_t S = E.getTarget().getAddress().getValue(); + int64_t A = E.getAddend(); + int64_t P = FixupAddress.getValue(); + int64_t GOTBase = GOTSymbol ? GOTSymbol->getAddress().getValue() : 0; + Edge::Kind K = E.getKind(); + + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Applying fixup on " << G.getEdgeKindName(K) + << " edge, (S, A, P, .GOT.) = (" << formatv("{0:x}", S) << ", " + << formatv("{0:x}", A) << ", " << formatv("{0:x}", P) << ", " + << formatv("{0:x}", GOTBase) << ")\n"; + }); + + const auto isAlignmentCorrect = [](uint64_t Value, int N) { + return (Value & (N - 1)) ? 
false : true; + }; + + switch (K) { + case Pointer64: { + uint64_t Value = S + A; + write64be(FixupPtr, Value); + break; + } + case Pointer32: { + uint64_t Value = S + A; + if (!LLVM_UNLIKELY(isUInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Pointer20: { + uint64_t Value = S + A; + if (!LLVM_UNLIKELY(isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Pointer16: { + uint64_t Value = S + A; + if (!LLVM_UNLIKELY(isUInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Pointer12: { + uint64_t Value = S + A; + if (!LLVM_UNLIKELY(isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + case Pointer8: { + uint64_t Value = S + A; + if (!LLVM_UNLIKELY(isUInt<8>(Value))) + return makeTargetOutOfRangeError(G, B, E); + *(uint8_t *)FixupPtr = Value; + break; + } + case Delta64: + case DeltaPLT64: { + int64_t Value = S + A - P; + write64be(FixupPtr, Value); + break; + } + case Delta32: + case DeltaPLT32: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case NegDelta32: { + int64_t Value = P + A - S; + if (!LLVM_UNLIKELY(isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dbl: + case DeltaPLT32dbl: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (!LLVM_UNLIKELY(isAlignmentCorrect(Value, 2))) + return 
makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta24dbl: + case DeltaPLT24dbl: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<25>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (!LLVM_UNLIKELY(isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + FixupPtr[0] = Value >> 17; + FixupPtr[1] = Value >> 9; + FixupPtr[2] = Value >> 1; + break; + } + case Delta16dbl: + case DeltaPLT16dbl: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<17>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (!LLVM_UNLIKELY(isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, Value >> 1); + break; + } + case Delta12dbl: + case DeltaPLT12dbl: { + int64_t Value = S + A - P; + if (!LLVM_UNLIKELY(isInt<13>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (!LLVM_UNLIKELY(isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write16be(FixupPtr, + (read16be(FixupPtr) & 0xF000) | ((Value >> 1) & 0x0FFF)); + break; + } + case Delta32GOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (!LLVM_UNLIKELY(isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta32dblGOTBase: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = GOTBase + A - P; + if (!LLVM_UNLIKELY(isInt<33>(Value))) + return makeTargetOutOfRangeError(G, B, E); + if (!LLVM_UNLIKELY(isAlignmentCorrect(Value, 2))) + return makeAlignmentError(FixupAddress, Value, 2, E); + write32be(FixupPtr, Value >> 1); + break; + } + case Delta64PLTFromGOT: + case Delta64FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + write64be(FixupPtr, Value); + break; + } + case Delta32PLTFromGOT: + case Delta32FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + 
int64_t Value = S + A - GOTBase; + if (!LLVM_UNLIKELY(isInt<32>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, Value); + break; + } + case Delta16PLTFromGOT: + case Delta16FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + int64_t Value = S + A - GOTBase; + if (!LLVM_UNLIKELY(isInt<16>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, Value); + break; + } + case Delta20FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (!LLVM_UNLIKELY(isInt<20>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write32be(FixupPtr, (read32be(FixupPtr) & 0xF00000FF) | + ((Value & 0xFFF) << 16) | ((Value & 0xFF000) >> 4)); + break; + } + case Delta12FromGOT: { + assert(GOTSymbol && "No GOT section symbol"); + uint64_t Value = S - GOTBase + A; + if (!LLVM_UNLIKELY(isUInt<12>(Value))) + return makeTargetOutOfRangeError(G, B, E); + write16be(FixupPtr, (read16be(FixupPtr) & 0xF000) | Value); + break; + } + default: + return make_error( + "In graph " + G.getName() + ", section " + B.getSection().getName() + + " unsupported edge kind " + getEdgeKindName(E.getKind())); + } + + return Error::success(); +} + +/// SystemZ null pointer content. +extern const char NullPointerContent[8]; +inline ArrayRef getGOTEntryBlockContent(LinkGraph &G) { + return {reinterpret_cast(NullPointerContent), + G.getPointerSize()}; +} + +/// SystemZ pointer jump stub content. +/// +/// Contains the instruction sequence for an indirect jump via an in-memory +/// pointer: +/// lgrl %r1, ptr +/// j %r1 +constexpr size_t StubEntrySize = 8; +extern const char Pointer64JumpStubContent[StubEntrySize]; +inline ArrayRef getStubBlockContent(LinkGraph &G) { + auto StubContent = Pointer64JumpStubContent; + return {reinterpret_cast(StubContent), StubEntrySize}; +} + +/// Creates a new pointer block in the given section and returns an +/// Anonymous symbol pointing to it. 
+/// +/// If InitialTarget is given then an Pointer64 relocation will be added to the +/// block pointing at InitialTarget. +inline Symbol &createAnonymousPointer(LinkGraph &G, Section &PointerSection, + Symbol *InitialTarget = nullptr, + uint64_t InitialAddend = 0) { + auto &B = G.createContentBlock(PointerSection, getGOTEntryBlockContent(G), + orc::ExecutorAddr(), G.getPointerSize(), 0); + if (InitialTarget) + B.addEdge(Pointer64, 0, *InitialTarget, InitialAddend); + return G.addAnonymousSymbol(B, 0, G.getPointerSize(), false, false); +} + +/// Create a jump stub block that jumps via the pointer at the given symbol. +/// +/// The stub block will have the following default values: +/// alignment: 16-bit +/// alignment-offset: 0 +inline Block &createPointerJumpStubBlock(LinkGraph &G, Section &StubSection, + Symbol &PointerSymbol) { + auto &B = G.createContentBlock(StubSection, getStubBlockContent(G), + orc::ExecutorAddr(), 16, 0); + B.addEdge(Delta32dbl, 2, PointerSymbol, 2); + return B; +} + +/// Create a jump stub that jumps via the pointer at the given symbol and +/// an anonymous symbol pointing to it. Return the anonymous symbol. +/// +/// The stub block will be created by createPointerJumpStubBlock. +inline Symbol &createAnonymousPointerJumpStub(LinkGraph &G, + Section &StubSection, + Symbol &PointerSymbol) { + return G.addAnonymousSymbol( + createPointerJumpStubBlock(G, StubSection, PointerSymbol), 0, + StubEntrySize, true, false); +} + +/// Global Offset Table Builder. 
+class GOTTableManager : public TableManager { +public: + static StringRef getSectionName() { return "$__GOT"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + Edge::Kind KindToSet = Edge::Invalid; + switch (E.getKind()) { + case systemz::RequestGOTAndTransformToDelta12FromGOT: + KindToSet = systemz::Delta12FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta16FromGOT: + KindToSet = systemz::Delta16FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta20FromGOT: + KindToSet = systemz::Delta20FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32FromGOT: + KindToSet = systemz::Delta32FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta64FromGOT: + KindToSet = systemz::Delta64FromGOT; + break; + case systemz::RequestGOTAndTransformToDelta32dbl: + KindToSet = systemz::DeltaPLT32dbl; + break; + default: + return false; + } + assert(KindToSet != Edge::Invalid && + "Fell through switch, but no new kind to set"); + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointer(G, getGOTSection(G), &Target); + } + +private: + Section &getGOTSection(LinkGraph &G) { + if (!GOTSection) + GOTSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *GOTSection; + } + + Section *GOTSection = nullptr; +}; + +/// Procedure Linkage Table Builder. 
+class PLTTableManager : public TableManager { +public: + PLTTableManager(GOTTableManager &GOT) : GOT(GOT) {} + + static StringRef getSectionName() { return "$__STUBS"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (E.getTarget().isDefined()) + return false; + + switch (E.getKind()) { + case systemz::DeltaPLT32: + case systemz::DeltaPLT64: + case systemz::DeltaPLT12dbl: + case systemz::DeltaPLT16dbl: + case systemz::DeltaPLT24dbl: + case systemz::DeltaPLT32dbl: + case systemz::Delta16PLTFromGOT: + case systemz::Delta32PLTFromGOT: + case systemz::Delta64PLTFromGOT: + break; + default: + return false; + } + DEBUG_WITH_TYPE("jitlink", { + dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " + << B->getFixupAddress(E) << " (" << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"; + }); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; + } + + Symbol &createEntry(LinkGraph &G, Symbol &Target) { + return createAnonymousPointerJumpStub(G, getStubsSection(G), + GOT.getEntryForTarget(G, Target)); + } + +public: + Section &getStubsSection(LinkGraph &G) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + return *StubsSection; + } + + GOTTableManager &GOT; + Section *StubsSection = nullptr; +}; + +} // namespace systemz +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_SYSTEMZ_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index fc01afc6d8739..01e9cf914cb54 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -43,7 +43,6 @@ #include #include #include -#include #include #include diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h index 
517089341978a..81c6a0b01530a 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h @@ -12,7 +12,6 @@ #include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index ef0fed4f41556..e6058612de4b7 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -20,7 +20,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 12dfb6c607bb9..67ebafc89cf99 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -241,7 +241,7 @@ ENUM(MotionExpectation, Present); // V5.2: [15.9.1] `task-dependence-type` modifier ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, Source); -ENUM(Prescriptiveness, Strict, Fallback); +ENUM(Prescriptiveness, Strict); template // struct LoopIterationT { @@ -591,10 +591,10 @@ struct DynamicAllocatorsT { template // struct DynGroupprivateT { ENUM(AccessGroup, Cgroup); - using Prescriptiveness = type::Prescriptiveness; + ENUM(Fallback, Abort, Default_Mem, Null); using Size = E; using TupleTrait = std::true_type; - std::tuple t; + std::tuple t; }; // V5.2: [5.8.4] `enter` clause diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 6d6eb5cda52de..d702273cec9ec 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -207,40 +207,40 @@ struct ConstructDecompositionT { template bool applyClause(Clause &&clause, const 
ClauseTy *node); + bool applyClause(const tomp::clause::AllocateT &clause, + const ClauseTy *); bool applyClause(const tomp::clause::CollapseT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::PrivateT &clause, + bool applyClause(const tomp::clause::DefaultT &clause, const ClauseTy *); bool applyClause(const tomp::clause::FirstprivateT &clause, const ClauseTy *); + bool applyClause(const tomp::clause::IfT &clause, + const ClauseTy *); bool applyClause(const tomp::clause::LastprivateT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::SharedT &clause, + bool applyClause(const tomp::clause::LinearT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::DefaultT &clause, + bool applyClause(const tomp::clause::NowaitT &clause, const ClauseTy *); bool - applyClause(const tomp::clause::ThreadLimitT &clause, + applyClause(const tomp::clause::OmpxAttributeT &clause, const ClauseTy *); + bool applyClause(const tomp::clause::OmpxBareT &clause, + const ClauseTy *); bool applyClause(const tomp::clause::OrderT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::AllocateT &clause, + bool applyClause(const tomp::clause::PrivateT &clause, const ClauseTy *); bool applyClause(const tomp::clause::ReductionT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::IfT &clause, - const ClauseTy *); - bool applyClause(const tomp::clause::LinearT &clause, - const ClauseTy *); - bool applyClause(const tomp::clause::NowaitT &clause, + bool applyClause(const tomp::clause::SharedT &clause, const ClauseTy *); bool - applyClause(const tomp::clause::OmpxAttributeT &clause, + applyClause(const tomp::clause::ThreadLimitT &clause, const ClauseTy *); - bool applyClause(const tomp::clause::OmpxBareT &clause, - const ClauseTy *); uint32_t version; llvm::omp::Directive construct; @@ -458,6 +458,34 @@ bool ConstructDecompositionT::applyClause(Specific &&specific, return false; } +// --- Specific clauses 
----------------------------------------------- + +// ALLOCATE +// [5.2:178:7-9] +// Directives: allocators, distribute, do, for, parallel, scope, sections, +// single, target, task, taskgroup, taskloop, teams +// +// [5.2:340:33-35] +// (33) The effect of the allocate clause is as if it is applied to all leaf +// constructs that permit the clause and to which a data-sharing attribute +// clause that may create a private copy of the same list item is applied. +template +bool ConstructDecompositionT::applyClause( + const tomp::clause::AllocateT &clause, + const ClauseTy *node) { + // This one needs to be applied at the end, once we know which clauses are + // assigned to which leaf constructs. + + // [5.2:340:33] + bool applied = applyIf(node, [&](const auto &leaf) { + return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { + return llvm::omp::isPrivatizingClause(n->id); + }); + }); + + return applied; +} + // COLLAPSE // [5.2:93:20-21] // Directives: distribute, do, for, loop, simd, taskloop @@ -483,19 +511,19 @@ bool ConstructDecompositionT::applyClause( return false; } -// PRIVATE -// [5.2:111:5-7] -// Directives: distribute, do, for, loop, parallel, scope, sections, simd, -// single, target, task, taskloop, teams +// DEFAULT +// [5.2:109:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:340:1-2] -// (1) The effect of the 1 private clause is as if it is applied only to the -// innermost leaf construct that permits it. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template bool ConstructDecompositionT::applyClause( - const tomp::clause::PrivateT &clause, + const tomp::clause::DefaultT &clause, const ClauseTy *node) { - return applyToInnermost(node); + // [5.2:340:31] + return applyToAll(node); } // FIRSTPRIVATE @@ -626,6 +654,44 @@ bool ConstructDecompositionT::applyClause( return applied; } +// IF +// [5.2:72:7-9] +// Directives: cancel, parallel, simd, target, target data, target enter data, +// target exit data, target update, task, taskloop +// +// [5.2:72:15-18] +// (15) For combined or composite constructs, the if clause only applies to the +// semantics of the construct named in the directive-name-modifier. +// (16) For a combined or composite construct, if no directive-name-modifier is +// specified then the if clause applies to all constituent constructs to which +// an if clause can apply. +template +bool ConstructDecompositionT::applyClause( + const tomp::clause::IfT &clause, + const ClauseTy *node) { + using DirectiveNameModifier = + typename clause::IfT::DirectiveNameModifier; + using IfExpression = typename clause::IfT::IfExpression; + auto &modifier = std::get>(clause.t); + + if (modifier) { + llvm::omp::Directive dirId = *modifier; + auto *unmodified = + makeClause(llvm::omp::Clause::OMPC_if, + tomp::clause::IfT{ + {/*DirectiveNameModifier=*/std::nullopt, + /*IfExpression=*/std::get(clause.t)}}); + + if (auto *hasDir = findDirective(dirId)) { + hasDir->clauses.push_back(unmodified); + return true; + } + return false; + } + + return applyToAll(node); +} + // LASTPRIVATE // [5.2:115:7-8] // Directives: distribute, do, for, loop, sections, simd, taskloop @@ -720,51 +786,94 @@ bool ConstructDecompositionT::applyClause( return applied; } -// SHARED -// [5.2:110:5-6] -// Directives: parallel, task, taskloop, teams +// LINEAR +// [5.2:118:1-2] +// Directives: declare simd, do, for, simd // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all 
leaf constructs that permit the clause. +// [5.2:341:15-22] +// (15.1) The effect of the linear clause is as if it is applied to the +// innermost leaf construct. +// (15.2) Additionally, if the list item is not the iteration variable of a simd +// or worksharing-loop SIMD construct, the effect on the outer leaf constructs +// is as if the list item was specified in firstprivate and lastprivate clauses +// on the combined or composite construct, with the rules specified above +// applied. +// (19) If a list item of the linear clause is the iteration variable of a simd +// or worksharing-loop SIMD construct and it is not declared in the construct, +// the effect on the outer leaf constructs is as if the list item was specified +// in a lastprivate clause on the combined or composite construct with the rules +// specified above applied. template bool ConstructDecompositionT::applyClause( - const tomp::clause::SharedT &clause, + const tomp::clause::LinearT &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + // [5.2:341:15.1] + if (!applyToInnermost(node)) + return false; + + // [5.2:341:15.2], [5.2:341:19] + auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); + std::optional iterVar = helper.getLoopIterVar(); + const auto &objects = std::get>(clause.t); + + // Lists of objects that will be used to construct "firstprivate" and + // "lastprivate" clauses. + tomp::ObjectListT first, last; + + for (const ObjectTy &object : objects) { + last.push_back(object); + if (!dirSimd || !iterVar || object.id() != iterVar->id()) + first.push_back(object); + } + + if (!first.empty()) { + auto *firstp = makeClause( + llvm::omp::Clause::OMPC_firstprivate, + tomp::clause::FirstprivateT{/*List=*/first}); + nodes.push_back(firstp); // Appending to the main clause list. 
+ } + if (!last.empty()) { + auto *lastp = + makeClause(llvm::omp::Clause::OMPC_lastprivate, + tomp::clause::LastprivateT{ + {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); + nodes.push_back(lastp); // Appending to the main clause list. + } + return true; } -// DEFAULT -// [5.2:109:5-6] -// Directives: parallel, task, taskloop, teams +// NOWAIT +// [5.2:308:11-13] +// Directives: dispatch, do, for, interop, scope, sections, single, target, +// target enter data, target exit data, target update, taskwait, workshare // -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. +// [5.2:341:23] +// (23) The effect of the nowait clause is as if it is applied to the outermost +// leaf construct that permits it. template bool ConstructDecompositionT::applyClause( - const tomp::clause::DefaultT &clause, + const tomp::clause::NowaitT &clause, const ClauseTy *node) { - // [5.2:340:31] - return applyToAll(node); + return applyToOutermost(node); } -// THREAD_LIMIT -// [5.2:277:14-15] -// Directives: target, teams -// -// [5.2:340:31-32] -// (31) The effect of the shared, default, thread_limit, or order clause is as -// if it is applied to all leaf constructs that permit the clause. 
+// OMPX_ATTRIBUTE template bool ConstructDecompositionT::applyClause( - const tomp::clause::ThreadLimitT &clause, + const tomp::clause::OmpxAttributeT &clause, const ClauseTy *node) { - // [5.2:340:31] + // ERROR: no leaf that allows clause return applyToAll(node); } +// OMPX_BARE +template +bool ConstructDecompositionT::applyClause( + const tomp::clause::OmpxBareT &clause, + const ClauseTy *node) { + return applyToOutermost(node); +} + // ORDER // [5.2:234:3-4] // Directives: distribute, do, for, loop, simd @@ -780,30 +889,19 @@ bool ConstructDecompositionT::applyClause( return applyToAll(node); } -// ALLOCATE -// [5.2:178:7-9] -// Directives: allocators, distribute, do, for, parallel, scope, sections, -// single, target, task, taskgroup, taskloop, teams +// PRIVATE +// [5.2:111:5-7] +// Directives: distribute, do, for, loop, parallel, scope, sections, simd, +// single, target, task, taskloop, teams // -// [5.2:340:33-35] -// (33) The effect of the allocate clause is as if it is applied to all leaf -// constructs that permit the clause and to which a data-sharing attribute -// clause that may create a private copy of the same list item is applied. +// [5.2:340:1-2] +// (1) The effect of the 1 private clause is as if it is applied only to the +// innermost leaf construct that permits it. template bool ConstructDecompositionT::applyClause( - const tomp::clause::AllocateT &clause, + const tomp::clause::PrivateT &clause, const ClauseTy *node) { - // This one needs to be applied at the end, once we know which clauses are - // assigned to which leaf constructs. 
- - // [5.2:340:33] - bool applied = applyIf(node, [&](const auto &leaf) { - return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { - return llvm::omp::isPrivatizingClause(n->id); - }); - }); - - return applied; + return applyToInnermost(node); } // REDUCTION @@ -983,129 +1081,38 @@ bool ConstructDecompositionT::applyClause( return applied; } -// IF -// [5.2:72:7-9] -// Directives: cancel, parallel, simd, target, target data, target enter data, -// target exit data, target update, task, taskloop +// SHARED +// [5.2:110:5-6] +// Directives: parallel, task, taskloop, teams // -// [5.2:72:15-18] -// (15) For combined or composite constructs, the if clause only applies to the -// semantics of the construct named in the directive-name-modifier. -// (16) For a combined or composite construct, if no directive-name-modifier is -// specified then the if clause applies to all constituent constructs to which -// an if clause can apply. +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. 
template bool ConstructDecompositionT::applyClause( - const tomp::clause::IfT &clause, + const tomp::clause::SharedT &clause, const ClauseTy *node) { - using DirectiveNameModifier = - typename clause::IfT::DirectiveNameModifier; - using IfExpression = typename clause::IfT::IfExpression; - auto &modifier = std::get>(clause.t); - - if (modifier) { - llvm::omp::Directive dirId = *modifier; - auto *unmodified = - makeClause(llvm::omp::Clause::OMPC_if, - tomp::clause::IfT{ - {/*DirectiveNameModifier=*/std::nullopt, - /*IfExpression=*/std::get(clause.t)}}); - - if (auto *hasDir = findDirective(dirId)) { - hasDir->clauses.push_back(unmodified); - return true; - } - return false; - } - + // [5.2:340:31] return applyToAll(node); } -// LINEAR -// [5.2:118:1-2] -// Directives: declare simd, do, for, simd -// -// [5.2:341:15-22] -// (15.1) The effect of the linear clause is as if it is applied to the -// innermost leaf construct. -// (15.2) Additionally, if the list item is not the iteration variable of a simd -// or worksharing-loop SIMD construct, the effect on the outer leaf constructs -// is as if the list item was specified in firstprivate and lastprivate clauses -// on the combined or composite construct, with the rules specified above -// applied. -// (19) If a list item of the linear clause is the iteration variable of a simd -// or worksharing-loop SIMD construct and it is not declared in the construct, -// the effect on the outer leaf constructs is as if the list item was specified -// in a lastprivate clause on the combined or composite construct with the rules -// specified above applied. 
-template -bool ConstructDecompositionT::applyClause( - const tomp::clause::LinearT &clause, - const ClauseTy *node) { - // [5.2:341:15.1] - if (!applyToInnermost(node)) - return false; - - // [5.2:341:15.2], [5.2:341:19] - auto dirSimd = findDirective(llvm::omp::Directive::OMPD_simd); - std::optional iterVar = helper.getLoopIterVar(); - const auto &objects = std::get>(clause.t); - - // Lists of objects that will be used to construct "firstprivate" and - // "lastprivate" clauses. - tomp::ObjectListT first, last; - - for (const ObjectTy &object : objects) { - last.push_back(object); - if (!dirSimd || !iterVar || object.id() != iterVar->id()) - first.push_back(object); - } - - if (!first.empty()) { - auto *firstp = makeClause( - llvm::omp::Clause::OMPC_firstprivate, - tomp::clause::FirstprivateT{/*List=*/first}); - nodes.push_back(firstp); // Appending to the main clause list. - } - if (!last.empty()) { - auto *lastp = - makeClause(llvm::omp::Clause::OMPC_lastprivate, - tomp::clause::LastprivateT{ - {/*LastprivateModifier=*/std::nullopt, /*List=*/last}}); - nodes.push_back(lastp); // Appending to the main clause list. - } - return true; -} - -// NOWAIT -// [5.2:308:11-13] -// Directives: dispatch, do, for, interop, scope, sections, single, target, -// target enter data, target exit data, target update, taskwait, workshare +// THREAD_LIMIT +// [5.2:277:14-15] +// Directives: target, teams // -// [5.2:341:23] -// (23) The effect of the nowait clause is as if it is applied to the outermost -// leaf construct that permits it. 
-template -bool ConstructDecompositionT::applyClause( - const tomp::clause::NowaitT &clause, - const ClauseTy *node) { - return applyToOutermost(node); -} - -template -bool ConstructDecompositionT::applyClause( - const tomp::clause::OmpxBareT &clause, - const ClauseTy *node) { - return applyToOutermost(node); -} - +// [5.2:340:31-32] +// (31) The effect of the shared, default, thread_limit, or order clause is as +// if it is applied to all leaf constructs that permit the clause. template bool ConstructDecompositionT::applyClause( - const tomp::clause::OmpxAttributeT &clause, + const tomp::clause::ThreadLimitT &clause, const ClauseTy *node) { + // [5.2:340:31] return applyToAll(node); } +// --- Splitting ------------------------------------------------------ + template bool ConstructDecompositionT::split() { bool success = true; diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index 1c86d181e4375..8f6fb4da0c839 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -28,7 +28,6 @@ #include "llvm/Support/TypeSize.h" #include #include -#include #include #include #include diff --git a/llvm/include/llvm/IR/PassInstrumentation.h b/llvm/include/llvm/IR/PassInstrumentation.h index 33eda5a4222f1..c25e2891d33d5 100644 --- a/llvm/include/llvm/IR/PassInstrumentation.h +++ b/llvm/include/llvm/IR/PassInstrumentation.h @@ -55,7 +55,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include #include namespace llvm { diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index acb17a8090c51..4354551a2405b 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -47,7 +47,6 @@ #include "llvm/Support/TypeName.h" #include #include -#include #include #include #include diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index 
11d32fbb64702..c514b768637d1 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -21,7 +21,6 @@ #include #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index bd7cd39ebb743..03777c7fcb45f 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -128,7 +128,6 @@ #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/LowerInvoke.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/ProfileData/HashKeyMap.h b/llvm/include/llvm/ProfileData/HashKeyMap.h index b2f1bf222157b..fceb95143340f 100644 --- a/llvm/include/llvm/ProfileData/HashKeyMap.h +++ b/llvm/include/llvm/ProfileData/HashKeyMap.h @@ -16,7 +16,6 @@ #define LLVM_PROFILEDATA_HASHKEYMAP_H #include "llvm/ADT/Hashing.h" -#include #include namespace llvm { diff --git a/llvm/include/llvm/Support/CFGDiff.h b/llvm/include/llvm/Support/CFGDiff.h index 41004d755a124..88f4fe52d2019 100644 --- a/llvm/include/llvm/Support/CFGDiff.h +++ b/llvm/include/llvm/Support/CFGDiff.h @@ -21,7 +21,6 @@ #include "llvm/Support/type_traits.h" #include #include -#include // Two booleans are used to define orders in graphs: // InverseGraph defines when we need to reverse the whole graph and is as such diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index af542bae9f8c6..b6aae9f7928e3 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/llvm/include/llvm/Support/type_traits.h b/llvm/include/llvm/Support/type_traits.h index a96125c16f11b..d037132fa5bad 100644 --- a/llvm/include/llvm/Support/type_traits.h +++ b/llvm/include/llvm/Support/type_traits.h @@ -15,7 +15,6 @@ #include 
"llvm/Support/Compiler.h" #include -#include namespace llvm { diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index b20c7e2ec07d2..9b607f1a3a8fc 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -22,7 +22,6 @@ #include #include #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h index 6bf4f1d1ae082..8de569827586c 100644 --- a/llvm/include/llvm/XRay/YAMLXRayRecord.h +++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h @@ -12,8 +12,6 @@ #ifndef LLVM_XRAY_YAMLXRAYRECORD_H #define LLVM_XRAY_YAMLXRAYRECORD_H -#include - #include "llvm/Support/YAMLTraits.h" #include "llvm/XRay/XRayRecord.h" diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 61b7b3fa9e2c4..7fe00c6e22c51 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -32,7 +32,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include -#include using namespace llvm; using namespace llvm::PatternMatch; diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index f2ada27cac01d..a3cd157e6aa61 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -17,8 +17,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/MsgPackDocument.h" -#include - namespace llvm { namespace AMDGPU { namespace HSAMD { diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 46b5bb7908227..060582cec74d8 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include -#include using namespace llvm; diff --git 
a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp index 11ca48d9fe05c..bb55fc77fca0f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp @@ -12,7 +12,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index 98cdada3d8add..aff6a76879062 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -26,7 +26,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 8931daf9bb3de..7716dfc21d475 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -20,7 +20,6 @@ #include "llvm/IR/DebugInfoMetadata.h" #include #include -#include #include namespace llvm { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp index d5dac417756f0..d304c7efe2a75 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp @@ -13,7 +13,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCStreamer.h" #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 3d71a9c7b0b9e..9dfb6af58323a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -32,7 +32,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp index 149f93fa69ccb..0260ee2e75aa5 100644 --- 
a/llvm/lib/CodeGen/LiveRangeCalc.cpp +++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp @@ -28,7 +28,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index ea08365810a29..187bff78f236f 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -937,16 +937,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { if (CopyOperands) { Register RegSrc = CopyOperands->Source->getReg(); Register RegDef = CopyOperands->Destination->getReg(); - // It's possible that the previous transformations have resulted in a - // no-op register move (i.e. one where source and destination registers - // are the same and are not referring to a reserved register). If so, - // delete it. - if (RegSrc == RegDef && !MRI->isReserved(RegSrc)) { - MI.eraseFromParent(); - NumDeletes++; - Changed = true; - continue; - } if (!TRI->regsOverlap(RegDef, RegSrc)) { // Copy is now a candidate for deletion. 
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 9ac3f7411af35..c40bd1c83f34a 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -31,7 +31,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 6b747f343c268..782898f430c19 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -69,7 +69,6 @@ #include #include #include -#include using namespace llvm; using namespace llvm::safestack; diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 2a8234a37a167..5fd5d6cce23df 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -49,7 +49,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 814b4b57a0b9b..01216552ed260 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include diff --git a/llvm/lib/DWARFCFIChecker/Registers.h b/llvm/lib/DWARFCFIChecker/Registers.h index a372c4c4345bd..915250de5aeae 100644 --- a/llvm/lib/DWARFCFIChecker/Registers.h +++ b/llvm/lib/DWARFCFIChecker/Registers.h @@ -17,7 +17,6 @@ #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" -#include namespace llvm { diff --git a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp index 1898fba004e88..c437c53b0481a 100644 --- a/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp +++ b/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/Error.h" 
#include -#include #include using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 44fe0a990e8c0..545896ff493c6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -31,7 +31,6 @@ #include #include #include -#include using namespace llvm; using namespace dwarf; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp index a201fae84838c..db6170c784f80 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp @@ -17,7 +17,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 87675be1fc8e1..9fe74898170a5 100644 --- a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -41,7 +41,6 @@ #include #include #include -#include #include #ifdef HAVE_FFI_CALL diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 4669124ebe578..0b530fb1bc478 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -26,6 +26,7 @@ add_llvm_component_library(LLVMJITLink ELF_loongarch.cpp ELF_ppc64.cpp ELF_riscv.cpp + ELF_systemz.cpp ELF_x86.cpp ELF_x86_64.cpp @@ -46,6 +47,7 @@ add_llvm_component_library(LLVMJITLink loongarch.cpp ppc64.cpp riscv.cpp + systemz.cpp x86.cpp x86_64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp index 87e451715811f..42f42eef00e5b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp @@ -18,6 +18,7 @@ #include "llvm/ExecutionEngine/JITLink/ELF_loongarch.h" #include "llvm/ExecutionEngine/JITLink/ELF_ppc64.h" #include 
"llvm/ExecutionEngine/JITLink/ELF_riscv.h" +#include "llvm/ExecutionEngine/JITLink/ELF_systemz.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86.h" #include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h" #include "llvm/Object/ELF.h" @@ -98,6 +99,8 @@ createLinkGraphFromELFObject(MemoryBufferRef ObjectBuffer, return createLinkGraphFromELFObject_loongarch(ObjectBuffer, std::move(SSP)); case ELF::EM_RISCV: return createLinkGraphFromELFObject_riscv(ObjectBuffer, std::move(SSP)); + case ELF::EM_S390: + return createLinkGraphFromELFObject_systemz(ObjectBuffer, std::move(SSP)); case ELF::EM_X86_64: return createLinkGraphFromELFObject_x86_64(ObjectBuffer, std::move(SSP)); case ELF::EM_386: @@ -135,6 +138,9 @@ void link_ELF(std::unique_ptr G, case Triple::riscv64: link_ELF_riscv(std::move(G), std::move(Ctx)); return; + case Triple::systemz: + link_ELF_systemz(std::move(G), std::move(Ctx)); + return; case Triple::x86_64: link_ELF_x86_64(std::move(G), std::move(Ctx)); return; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp new file mode 100644 index 0000000000000..29eeecceea766 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_systemz.cpp @@ -0,0 +1,424 @@ +//===----- ELF_systemz.cpp - JIT linker implementation for ELF/systemz ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ELF/systemz jit-link implementation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "DefineExternalSectionStartAndEndSymbols.h" +#include "EHFrameSupportImpl.h" +#include "ELFLinkGraphBuilder.h" +#include "JITLinkGeneric.h" + +#define DEBUG_TYPE "jitlink" + +using namespace llvm; +using namespace llvm::jitlink; + +namespace { + +constexpr StringRef ELFGOTSymbolName = "_GLOBAL_OFFSET_TABLE_"; + +Error buildTables_ELF_systemz(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Visiting edges in graph:\n"); + systemz::GOTTableManager GOT; + systemz::PLTTableManager PLT(GOT); + visitExistingEdges(G, GOT, PLT); + return Error::success(); +} + +} // namespace + +namespace llvm { +namespace jitlink { +class ELFJITLinker_systemz : public JITLinker { + friend class JITLinker; + +public: + ELFJITLinker_systemz(std::unique_ptr Ctx, + std::unique_ptr G, + PassConfiguration PassConfig) + : JITLinker(std::move(Ctx), std::move(G), std::move(PassConfig)) { + if (shouldAddDefaultTargetPasses(getGraph().getTargetTriple())) + getPassConfig().PostAllocationPasses.push_back( + [this](LinkGraph &G) { return getOrCreateGOTSymbol(G); }); + } + +private: + Symbol *GOTSymbol = nullptr; + + Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const { + return systemz::applyFixup(G, B, E, GOTSymbol); + } + + Error getOrCreateGOTSymbol(LinkGraph &G) { + auto DefineExternalGOTSymbolIfPresent = + createDefineExternalSectionStartAndEndSymbolsPass( + [&](LinkGraph &LG, Symbol &Sym) -> SectionRangeSymbolDesc { + if (Sym.getName() != nullptr && + *Sym.getName() == ELFGOTSymbolName) + if (auto *GOTSection = G.findSectionByName( + systemz::GOTTableManager::getSectionName())) { + GOTSymbol = &Sym; + return {*GOTSection, true}; + } + return {}; + }); + + // Try to attach _GLOBAL_OFFSET_TABLE_ to the GOT if it's defined as an + // 
external. + if (auto Err = DefineExternalGOTSymbolIfPresent(G)) + return Err; + + // If we succeeded then we're done. + if (GOTSymbol) + return Error::success(); + + // Otherwise look for a GOT section: If it already has a start symbol we'll + // record it, otherwise we'll create our own. + // If there's a GOT section but we didn't find an external GOT symbol... + if (auto *GOTSection = + G.findSectionByName(systemz::GOTTableManager::getSectionName())) { + + // Check for an existing defined symbol. + for (auto *Sym : GOTSection->symbols()) + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + GOTSymbol = Sym; + return Error::success(); + } + + // If there's no defined symbol then create one. + SectionRange SR(*GOTSection); + if (SR.empty()) + GOTSymbol = + &G.addAbsoluteSymbol(ELFGOTSymbolName, orc::ExecutorAddr(), 0, + Linkage::Strong, Scope::Local, true); + else + GOTSymbol = + &G.addDefinedSymbol(*SR.getFirstBlock(), 0, ELFGOTSymbolName, 0, + Linkage::Strong, Scope::Local, false, true); + } + + // If we still haven't found a GOT symbol then double check the externals. + // We may have a GOT-relative reference but no GOT section, in which case + // we just need to point the GOT symbol at some address in this graph. + if (!GOTSymbol) { + for (auto *Sym : G.external_symbols()) { + if (Sym->getName() != nullptr && *Sym->getName() == ELFGOTSymbolName) { + auto Blocks = G.blocks(); + if (!Blocks.empty()) { + G.makeAbsolute(*Sym, (*Blocks.begin())->getAddress()); + GOTSymbol = Sym; + break; + } + } + } + } + + return Error::success(); + } +}; + +class ELFLinkGraphBuilder_systemz + : public ELFLinkGraphBuilder { +private: + using ELFT = object::ELF64BE; + using Base = ELFLinkGraphBuilder; + using Base::G; // Use LinkGraph pointer from base class. 
+ + Error addRelocations() override { + LLVM_DEBUG(dbgs() << "Processing relocations:\n"); + + using Base = ELFLinkGraphBuilder; + using Self = ELFLinkGraphBuilder_systemz; + for (const auto &RelSect : Base::Sections) { + if (RelSect.sh_type == ELF::SHT_REL) + // Validate the section to read relocation entries from. + return make_error("No SHT_REL in valid " + + G->getTargetTriple().getArchName() + + " ELF object files", + inconvertibleErrorCode()); + + if (Error Err = Base::forEachRelaRelocation(RelSect, this, + &Self::addSingleRelocation)) + return Err; + } + + return Error::success(); + } + + Error addSingleRelocation(const typename ELFT::Rela &Rel, + const typename ELFT::Shdr &FixupSect, + Block &BlockToFix) { + using support::big32_t; + using Base = ELFLinkGraphBuilder; + auto ELFReloc = Rel.getType(false); + + // No reloc. + if (LLVM_UNLIKELY(ELFReloc == ELF::R_390_NONE)) + return Error::success(); + + uint32_t SymbolIndex = Rel.getSymbol(false); + auto ObjSymbol = Base::Obj.getRelocationSymbol(Rel, Base::SymTabSec); + if (!ObjSymbol) + return ObjSymbol.takeError(); + + Symbol *GraphSymbol = Base::getGraphSymbol(SymbolIndex); + if (!GraphSymbol) + return make_error( + formatv("Could not find symbol at given index, did you add it to " + "JITSymbolTable? index: {0}, shndx: {1} Size of table: {2}", + SymbolIndex, (*ObjSymbol)->st_shndx, + Base::GraphSymbols.size()), + inconvertibleErrorCode()); + + // Validate the relocation kind. 
+ int64_t Addend = Rel.r_addend; + Edge::Kind Kind = Edge::Invalid; + + switch (ELFReloc) { + case ELF::R_390_PC64: { + Kind = systemz::Delta64; + break; + } + case ELF::R_390_PC32: { + Kind = systemz::Delta32; + break; + } + case ELF::R_390_PC16: { + Kind = systemz::Delta16; + break; + } + case ELF::R_390_PC32DBL: { + Kind = systemz::Delta32dbl; + break; + } + case ELF::R_390_PC24DBL: { + Kind = systemz::Delta24dbl; + break; + } + case ELF::R_390_PC16DBL: { + Kind = systemz::Delta16dbl; + break; + } + case ELF::R_390_PC12DBL: { + Kind = systemz::Delta12dbl; + break; + } + case ELF::R_390_64: { + Kind = systemz::Pointer64; + break; + } + case ELF::R_390_32: { + Kind = systemz::Pointer32; + break; + } + case ELF::R_390_20: { + Kind = systemz::Pointer20; + break; + } + case ELF::R_390_16: { + Kind = systemz::Pointer16; + break; + } + case ELF::R_390_12: { + Kind = systemz::Pointer12; + break; + } + case ELF::R_390_8: { + Kind = systemz::Pointer8; + break; + } + // Relocations targeting the PLT associated with the symbol. + case ELF::R_390_PLT64: { + Kind = systemz::DeltaPLT64; + break; + } + case ELF::R_390_PLT32: { + Kind = systemz::DeltaPLT32; + break; + } + case ELF::R_390_PLT32DBL: { + Kind = systemz::DeltaPLT32dbl; + break; + } + case ELF::R_390_PLT24DBL: { + Kind = systemz::DeltaPLT24dbl; + break; + } + case ELF::R_390_PLT16DBL: { + Kind = systemz::DeltaPLT16dbl; + break; + } + case ELF::R_390_PLT12DBL: { + Kind = systemz::DeltaPLT12dbl; + break; + } + case ELF::R_390_PLTOFF64: { + Kind = systemz::Delta64PLTFromGOT; + break; + } + case ELF::R_390_PLTOFF32: { + Kind = systemz::Delta32PLTFromGOT; + break; + } + case ELF::R_390_PLTOFF16: { + Kind = systemz::Delta16PLTFromGOT; + break; + } + // Relocations targeting the actual symbol (just relative to the GOT). 
+ case ELF::R_390_GOTOFF64: { + Kind = systemz::Delta64FromGOT; + break; + } + case ELF::R_390_GOTOFF: { + Kind = systemz::Delta32FromGOT; + break; + } + case ELF::R_390_GOTOFF16: { + Kind = systemz::Delta16FromGOT; + break; + } + // Relocations targeting the GOT entry associated with the symbol. + case ELF::R_390_GOT64: + case ELF::R_390_GOTPLT64: { + Kind = systemz::RequestGOTAndTransformToDelta64FromGOT; + break; + } + case ELF::R_390_GOT32: + case ELF::R_390_GOTPLT32: { + Kind = systemz::RequestGOTAndTransformToDelta32FromGOT; + break; + } + case ELF::R_390_GOT20: + case ELF::R_390_GOTPLT20: { + Kind = systemz::RequestGOTAndTransformToDelta20FromGOT; + break; + } + case ELF::R_390_GOT16: + case ELF::R_390_GOTPLT16: { + Kind = systemz::RequestGOTAndTransformToDelta16FromGOT; + break; + } + case ELF::R_390_GOT12: + case ELF::R_390_GOTPLT12: { + Kind = systemz::RequestGOTAndTransformToDelta12FromGOT; + break; + } + case ELF::R_390_GOTENT: + case ELF::R_390_GOTPLTENT: { + Kind = systemz::RequestGOTAndTransformToDelta32dbl; + break; + } + // R_390_GOTPC and R_390_GOTPCDBL don't create a GOT entry; they don't even + // have a symbol. 
+ case ELF::R_390_GOTPC: { + Kind = systemz::Delta32GOTBase; + break; + } + case ELF::R_390_GOTPCDBL: { + Kind = systemz::Delta32dblGOTBase; + break; + } + default: + return make_error( + "In " + G->getName() + ": Unsupported systemz relocation type " + + object::getELFRelocationTypeName(ELF::EM_S390, ELFReloc)); + } + auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset; + Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); + Edge GE(Kind, Offset, *GraphSymbol, Addend); + LLVM_DEBUG({ + dbgs() << " "; + printEdge(dbgs(), BlockToFix, GE, systemz::getEdgeKindName(Kind)); + dbgs() << "\n"; + }); + + BlockToFix.addEdge(std::move(GE)); + + return Error::success(); + } + +public: + ELFLinkGraphBuilder_systemz(StringRef FileName, + const object::ELFFile &Obj, + std::shared_ptr SSP, + Triple TT, SubtargetFeatures Features) + : ELFLinkGraphBuilder(Obj, std::move(SSP), std::move(TT), + std::move(Features), FileName, + systemz::getEdgeKindName) {} +}; + +Expected> createLinkGraphFromELFObject_systemz( + MemoryBufferRef ObjectBuffer, std::shared_ptr SSP) { + LLVM_DEBUG({ + dbgs() << "Building jitlink graph for new input " + << ObjectBuffer.getBufferIdentifier() << "...\n"; + }); + + auto ELFObj = object::ObjectFile::createELFObjectFile(ObjectBuffer); + if (!ELFObj) + return ELFObj.takeError(); + + auto Features = (*ELFObj)->getFeatures(); + if (!Features) + return Features.takeError(); + + assert((*ELFObj)->getArch() == Triple::systemz && + "Only SystemZ is supported"); + + auto &ELFObjFile = cast>(**ELFObj); + return ELFLinkGraphBuilder_systemz( + (*ELFObj)->getFileName(), ELFObjFile.getELFFile(), std::move(SSP), + (*ELFObj)->makeTriple(), std::move(*Features)) + .buildGraph(); +} + +void link_ELF_systemz(std::unique_ptr G, + std::unique_ptr Ctx) { + PassConfiguration Config; + const Triple &TT = G->getTargetTriple(); + if (Ctx->shouldAddDefaultTargetPasses(TT)) { + // Add eh-frame passes. 
+ Config.PrePrunePasses.push_back(DWARFRecordSectionSplitter(".eh_frame")); + Config.PrePrunePasses.push_back( + EHFrameEdgeFixer(".eh_frame", G->getPointerSize(), systemz::Pointer32, + systemz::Pointer64, systemz::Delta32, systemz::Delta64, + systemz::NegDelta32)); + Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame")); + + // Add a mark-live pass. + if (auto MarkLive = Ctx->getMarkLivePass(TT)) + Config.PrePrunePasses.push_back(std::move(MarkLive)); + else + Config.PrePrunePasses.push_back(markAllSymbolsLive); + + // Add an in-place GOT/Stubs build pass. + Config.PostPrunePasses.push_back(buildTables_ELF_systemz); + + // Resolve any external section start / end symbols. + Config.PostAllocationPasses.push_back( + createDefineExternalSectionStartAndEndSymbolsPass( + identifyELFSectionStartAndEndSymbols)); + + // TODO: Add GOT/Stubs optimizer pass. + // Config.PreFixupPasses.push_back(systemz::optimizeGOTAndStubAccesses); + } + + if (auto Err = Ctx->modifyPassConfig(*G, Config)) + return Ctx->notifyFailed(std::move(Err)); + + ELFJITLinker_systemz::link(std::move(Ctx), std::move(G), std::move(Config)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 6e316f105715d..d98ded1ee4c32 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -16,6 +16,7 @@ #include "llvm/ExecutionEngine/JITLink/XCOFF.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/loongarch.h" +#include "llvm/ExecutionEngine/JITLink/systemz.h" #include "llvm/ExecutionEngine/JITLink/x86.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" #include "llvm/Support/raw_ostream.h" @@ -479,6 +480,8 @@ AnonymousPointerCreator getAnonymousPointerCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointer; + case Triple::systemz: + 
return systemz::createAnonymousPointer; default: return nullptr; } @@ -495,6 +498,8 @@ PointerJumpStubCreator getPointerJumpStubCreator(const Triple &TT) { case Triple::loongarch32: case Triple::loongarch64: return loongarch::createAnonymousPointerJumpStub; + case Triple::systemz: + return systemz::createAnonymousPointerJumpStub; default: return nullptr; } diff --git a/llvm/lib/ExecutionEngine/JITLink/systemz.cpp b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp new file mode 100644 index 0000000000000..f6cc29fa6e6a1 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/systemz.cpp @@ -0,0 +1,114 @@ +//===---- systemz.cpp - Generic JITLink systemz edge kinds, utilities -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic utilities for graphs representing systemz objects. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/systemz.h" + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { +namespace systemz { + +const char NullPointerContent[8] = {0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + +const char Pointer64JumpStubContent[8] = { + static_cast(0xC4u), + 0x18, + 0x00, + 0x00, + 0x00, + 0x00, // lgrl r1 + static_cast(0x07u), + static_cast(0xF1u), // BCR 15, 1 +}; + +const char *getEdgeKindName(Edge::Kind R) { + switch (R) { + case Pointer64: + return "Pointer64"; + case Pointer32: + return "Pointer32"; + case Pointer20: + return "Pointer20"; + case Pointer16: + return "Pointer16"; + case Pointer12: + return "Pointer12"; + case Pointer8: + return "Pointer8"; + case Delta64: + return "Delta64"; + case Delta32: + return "Delta32"; + case Delta16: + return "Delta16"; + case Delta32dbl: + return "Delta32dbl"; + case Delta24dbl: + return "Delta24dbl"; + case Delta16dbl: + return "Delta16dbl"; + case Delta12dbl: + return "Delta12dbl"; + case NegDelta64: + return "NegDelta64"; + case NegDelta32: + return "NegDelta32"; + case DeltaPLT32dbl: + return "DeltaPLT32dbl"; + case DeltaPLT24dbl: + return "DeltaPLT24dbl"; + case DeltaPLT16dbl: + return "DeltaPLT16dbl"; + case DeltaPLT12dbl: + return "DeltaPLT12dbl"; + case DeltaPLT64: + return "DeltaPLT64"; + case DeltaPLT32: + return "DeltaPLT32"; + case Delta64FromGOT: + return "Delta64FromGOT"; + case Delta32FromGOT: + return "Delta32FromGOT"; + case Delta16FromGOT: + return "Delta16FromGOT"; + case Delta64PLTFromGOT: + return "Delta64PLTFromGOT"; + case Delta32PLTFromGOT: + return "Delta32PLTFromGOT"; + case Delta16PLTFromGOT: + return "Delta16PLTFromGOT"; + case Delta32GOTBase: + return "Delta32GOTBase"; + case Delta32dblGOTBase: + return "Delta32dblGOTBase"; + case RequestGOTAndTransformToDelta64FromGOT: + return "RequestGOTAndTransformToDelta64FromGOT"; + case 
RequestGOTAndTransformToDelta32FromGOT: + return "RequestGOTAndTransformToDelta32FromGOT"; + case RequestGOTAndTransformToDelta20FromGOT: + return "RequestGOTAndTransformToDelta20FromGOT"; + case RequestGOTAndTransformToDelta16FromGOT: + return "RequestGOTAndTransformToDelta16FromGOT"; + case RequestGOTAndTransformToDelta12FromGOT: + return "RequestGOTAndTransformToDelta12FromGOT"; + case RequestGOTAndTransformToDelta32dbl: + return "RequestGOTAndTransformToDelta32dbl"; + default: + return getGenericEdgeKindName(static_cast(R)); + } +} + +} // namespace systemz +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index ba3c039503720..d35438d5e1c9c 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -40,7 +40,6 @@ #include #include #include -#include using namespace llvm; using namespace llvm::at; diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 1f5e78732761d..fd8add4e07832 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include -#include using namespace llvm; diff --git a/llvm/lib/IR/PassRegistry.cpp b/llvm/lib/IR/PassRegistry.cpp index 94afbb52d70e3..a91bb563af4bb 100644 --- a/llvm/lib/IR/PassRegistry.cpp +++ b/llvm/lib/IR/PassRegistry.cpp @@ -17,7 +17,6 @@ #include "llvm/PassInfo.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 8e60577bf8fb4..e3ece87778a04 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include -#include using namespace llvm; diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp index cd1cee16e7473..3bf52f6ef024e 100644 --- a/llvm/lib/IR/ValueSymbolTable.cpp +++ b/llvm/lib/IR/ValueSymbolTable.cpp 
@@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" #include -#include using namespace llvm; diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp index 5dd79946d8779..2a796fb1cfe11 100644 --- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp @@ -21,7 +21,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 04e12e56c4262..6e685c60a406e 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/SectionKind.h" #include "llvm/Support/SMLoc.h" #include -#include using namespace llvm; diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 911d92c51b59b..c3faab89bb258 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/SMLoc.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 39542bfbdd8e3..a95ccf83a2636 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -33,7 +33,6 @@ #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp index 9f9d63f212c88..e2fc32d90f15e 100644 --- a/llvm/lib/Option/ArgList.cpp +++ b/llvm/lib/Option/ArgList.cpp @@ -24,7 +24,6 @@ #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 0450b2fd172ef..20398b5f582f4 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/TableGen/DetailedRecordsBackend.cpp 
b/llvm/lib/TableGen/DetailedRecordsBackend.cpp index 1ed64356b7c62..b1152bf680c69 100644 --- a/llvm/lib/TableGen/DetailedRecordsBackend.cpp +++ b/llvm/lib/TableGen/DetailedRecordsBackend.cpp @@ -22,7 +22,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include -#include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index a67bd42aa16e0..d87bb522c99e8 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index dad94b83aa84f..8838a94a639eb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -52,7 +52,7 @@ struct ArgDescriptor { } static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { - return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); + return ArgDescriptor(Arg.Reg.id(), Mask, Arg.IsStack, Arg.IsSet); } bool isSet() const { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e3f3abae01648..dd3120f05ce26 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1199,8 +1199,8 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { // Given a wide tuple \p Reg check if it will overflow 256 registers. // \returns \p Reg on success or NoRegister otherwise. 
-static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, - const MCRegisterInfo &MRI) { +static MCRegister CheckVGPROverflow(MCRegister Reg, const MCRegisterClass &RC, + const MCRegisterInfo &MRI) { unsigned NumRegs = RC.getSizeInBits() / 32; MCRegister Sub0 = MRI.getSubReg(Reg, AMDGPU::sub0); if (!Sub0) @@ -1214,7 +1214,7 @@ static unsigned CheckVGPROverflow(unsigned Reg, const MCRegisterClass &RC, assert(BaseReg && "Only vector registers expected"); - return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : AMDGPU::NoRegister; + return (Sub0 - BaseReg + NumRegs <= 256) ? Reg : MCRegister(); } // Note that before gfx10, the MIMG encoding provided no information about @@ -1456,9 +1456,8 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, return MCOperand(); } -inline -MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); +inline MCOperand AMDGPUDisassembler::createRegOperand(MCRegister Reg) const { + return MCOperand::createReg(AMDGPU::getMCReg(Reg, STI)); } inline diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index d103d79fdabb9..ab130dbb08ff9 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -69,7 +69,7 @@ class AMDGPUDisassembler : public MCDisassembler { const char* getRegClassName(unsigned RegClassID) const; - MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(MCRegister Reg) const; MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 703ec0a4befa5..207c1da56ca59 100644 --- 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -336,7 +336,7 @@ void AMDGPUInstPrinter::printSymbolicFormat(const MCInst *MI, // \returns a low 256 vgpr representing a high vgpr \p Reg [v256..v1023] or // \p Reg itself otherwise. -static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { +static MCRegister getRegForPrinting(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx < 0x100) @@ -355,10 +355,10 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. -static MCPhysReg getRegFromMIA(MCPhysReg Reg, unsigned OpNo, - const MCInstrDesc &Desc, - const MCRegisterInfo &MRI, - const AMDGPUMCInstrAnalysis &MIA) { +static MCRegister getRegFromMIA(MCRegister Reg, unsigned OpNo, + const MCInstrDesc &Desc, + const MCRegisterInfo &MRI, + const AMDGPUMCInstrAnalysis &MIA) { unsigned VgprMSBs = MIA.getVgprMSBs(); if (!VgprMSBs) return Reg; @@ -403,10 +403,10 @@ void AMDGPUInstPrinter::printRegOperand(MCRegister Reg, raw_ostream &O, } #endif - unsigned PrintReg = getRegForPrinting(Reg, MRI); + MCRegister PrintReg = getRegForPrinting(Reg, MRI); O << getRegisterName(PrintReg); - if (PrintReg != Reg.id()) + if (PrintReg != Reg) O << " /*" << getRegisterName(Reg) << "*/"; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2c689cc8bcbef..ae86d1382382f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17389,12 +17389,14 @@ void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { // Abandon attempt if the dst size isn't large enough // - this is in fact an error but this is picked up elsewhere and // reported correctly. 
- uint32_t DstSize = - TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + + uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32; if (DstSize < InitIdx) return; } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { - InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx); + InitIdx = TRI.getRegSizeInBits(*DstRC) / 32; } else { return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b3c351a5ba6ce..259442fe774bf 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7686,6 +7686,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); + const DebugLoc &DL = Inst.getDebugLoc(); + // Handle some special cases switch (Opcode) { default: @@ -7923,7 +7925,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest0 = Inst.getOperand(0); MachineOperand &Dest1 = Inst.getOperand(1); MachineOperand &Src0 = Inst.getOperand(2); @@ -7943,12 +7944,37 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); } return; + case AMDGPU::S_LSHL1_ADD_U32: + case AMDGPU::S_LSHL2_ADD_U32: + case AMDGPU::S_LSHL3_ADD_U32: + case AMDGPU::S_LSHL4_ADD_U32: { + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); 
+ unsigned ShiftAmt = (Opcode == AMDGPU::S_LSHL1_ADD_U32 ? 1 + : Opcode == AMDGPU::S_LSHL2_ADD_U32 ? 2 + : Opcode == AMDGPU::S_LSHL3_ADD_U32 ? 3 + : 4); + + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = + BuildMI(*MBB, &Inst, DL, get(AMDGPU::V_LSHL_ADD_U32_e64), DestReg) + .add(Src0) + .addImm(ShiftAmt) + .add(Src1); + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest.getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; case AMDGPU::S_CSELECT_B32: case AMDGPU::S_CSELECT_B64: lowerSelect(Worklist, Inst, MDT); @@ -8045,7 +8071,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } case AMDGPU::S_CVT_HI_F32_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.useRealTrue16Insts()) { @@ -8075,7 +8100,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F32: case AMDGPU::S_MAXIMUM_F32: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) .addImm(0) // src0_modifiers @@ -8093,7 +8117,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } case AMDGPU::S_MINIMUM_F16: case AMDGPU::S_MAXIMUM_F16: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? 
&AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); @@ -8117,7 +8140,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::V_S_RCP_F16_e64: case AMDGPU::V_S_RSQ_F16_e64: case AMDGPU::V_S_SQRT_F16_e64: { - const DebugLoc &DL = Inst.getDebugLoc(); Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass : &AMDGPU::VGPR_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b7f63eceb5d5c..0bde5d3fd2f26 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -776,11 +776,7 @@ def xnor : PatFrag < foreach I = 1-4 in { def shl#I#_add : PatFrag < (ops node:$src0, node:$src1), - (add (shl_oneuse $src0, (i32 I)), $src1)> { - // FIXME: Poor substitute for disabling pattern in SelectionDAG - let PredicateCode = [{return false;}]; - let GISelPredicateCode = [{return true;}]; -} + (add (shl_oneuse $src0, (i32 I)), $src1)>; } multiclass SIAtomicM0Glue2 isBottomOfStack()) { // Convert to a swizzled stack address by scaling by the wave size. // In an entry function/kernel the offset is already swizzled. 
- bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum)); + bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum)); bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr); const TargetRegisterClass *RC = IsSALU && !LiveSCC diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 3e1b058726dbb..37bf2d2463ae2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -897,7 +897,7 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const { } std::optional InstInfo::getInvalidCompOperandIndex( - std::function GetRegIdx, + std::function GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc, bool AllowSameVGPR, bool VOPD3) const { @@ -914,12 +914,13 @@ std::optional InstInfo::getInvalidCompOperandIndex( BaseX = X; if (!BaseY) BaseY = Y; - if ((BaseX & BanksMask) == (BaseY & BanksMask)) + if ((BaseX.id() & BanksMask) == (BaseY.id() & BanksMask)) return true; if (BaseX != X /* This is 64-bit register */ && - ((BaseX + 1) & BanksMask) == (BaseY & BanksMask)) + ((BaseX.id() + 1) & BanksMask) == (BaseY.id() & BanksMask)) return true; - if (BaseY != Y && (BaseX & BanksMask) == ((BaseY + 1) & BanksMask)) + if (BaseY != Y && + (BaseX.id() & BanksMask) == ((BaseY.id() + 1) & BanksMask)) return true; // If both are 64-bit bank conflict will be detected yet while checking @@ -968,7 +969,7 @@ std::optional InstInfo::getInvalidCompOperandIndex( // if the operand is not a register or not a VGPR. InstInfo::RegIndices InstInfo::getRegIndices(unsigned CompIdx, - std::function GetRegIdx, + std::function GetRegIdx, bool VOPD3) const { assert(CompIdx < COMPONENTS_NUM); @@ -983,7 +984,7 @@ InstInfo::getRegIndices(unsigned CompIdx, Comp.hasRegSrcOperand(CompSrcIdx) ? 
GetRegIdx(CompIdx, Comp.getIndexOfSrcInMCOperands(CompSrcIdx, VOPD3)) - : 0; + : MCRegister(); } return RegIndices; } @@ -2697,8 +2698,8 @@ MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI) { MCRegister mc2PseudoReg(MCRegister Reg) { MAP_REG2REG } -bool isInlineValue(unsigned Reg) { - switch (Reg) { +bool isInlineValue(MCRegister Reg) { + switch (Reg.id()) { case AMDGPU::SRC_SHARED_BASE_LO: case AMDGPU::SRC_SHARED_BASE: case AMDGPU::SRC_SHARED_LIMIT_LO: @@ -3361,7 +3362,7 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format, : getGfx9BufferFormatInfo(Format); } -const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI) { const unsigned VGPRClasses[] = { AMDGPU::VGPR_16RegClassID, AMDGPU::VGPR_32RegClassID, @@ -3382,22 +3383,22 @@ const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, return nullptr; } -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI) { +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; return Idx >> 8; } -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI) { +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI) { unsigned Enc = MRI.getEncodingValue(Reg); unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; if (Idx >= 0x100) - return AMDGPU::NoRegister; + return MCRegister(); const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) - return AMDGPU::NoRegister; + return MCRegister(); Idx |= MSBs << 8; if (RC->getID() == AMDGPU::VGPR_16RegClassID) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5e3195b36fe4c..9f65f9326a73e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -909,7 +909,7 
@@ class InstInfo { const ComponentInfo CompInfo[COMPONENTS_NUM]; public: - using RegIndices = std::array; + using RegIndices = std::array; InstInfo(const MCInstrDesc &OpX, const MCInstrDesc &OpY) : CompInfo{OpX, OpY} {} @@ -932,9 +932,10 @@ class InstInfo { // even though it violates requirement to be from different banks. // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. - bool hasInvalidOperand(std::function GetRegIdx, - const MCRegisterInfo &MRI, bool SkipSrc = false, - bool AllowSameVGPR = false, bool VOPD3 = false) const { + bool + hasInvalidOperand(std::function GetRegIdx, + const MCRegisterInfo &MRI, bool SkipSrc = false, + bool AllowSameVGPR = false, bool VOPD3 = false) const { return getInvalidCompOperandIndex(GetRegIdx, MRI, SkipSrc, AllowSameVGPR, VOPD3) .has_value(); @@ -949,14 +950,14 @@ class InstInfo { // If \p VOPD3 is set to true both dst registers allowed to be either odd // or even and instruction may have real src2 as opposed to tied accumulator. std::optional getInvalidCompOperandIndex( - std::function GetRegIdx, + std::function GetRegIdx, const MCRegisterInfo &MRI, bool SkipSrc = false, bool AllowSameVGPR = false, bool VOPD3 = false) const; private: RegIndices getRegIndices(unsigned ComponentIdx, - std::function GetRegIdx, + std::function GetRegIdx, bool VOPD3) const; }; @@ -1599,7 +1600,7 @@ LLVM_READNONE MCRegister mc2PseudoReg(MCRegister Reg); LLVM_READNONE -bool isInlineValue(unsigned Reg); +bool isInlineValue(MCRegister Reg); /// Is this an AMDGPU specific source operand? These include registers, /// inline constants, literals and mandatory literals (KImm). @@ -1798,16 +1799,16 @@ bool isIntrinsicAlwaysUniform(unsigned IntrID); /// \returns a register class for the physical register \p Reg if it is a VGPR /// or nullptr otherwise. 
-const MCRegisterClass *getVGPRPhysRegClass(MCPhysReg Reg, +const MCRegisterClass *getVGPRPhysRegClass(MCRegister Reg, const MCRegisterInfo &MRI); /// \returns the MODE bits which have to be set by the S_SET_VGPR_MSB for the /// physical register \p Reg. -unsigned getVGPREncodingMSBs(MCPhysReg Reg, const MCRegisterInfo &MRI); +unsigned getVGPREncodingMSBs(MCRegister Reg, const MCRegisterInfo &MRI); /// If \p Reg is a low VGPR return a corresponding high VGPR with \p MSBs set. -MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, - const MCRegisterInfo &MRI); +MCRegister getVGPRWithMSBs(MCRegister Reg, unsigned MSBs, + const MCRegisterInfo &MRI); // Returns a table for the opcode with a given \p Desc to map the VGPR MSB // set by the S_SET_VGPR_MSB to one of 4 sources. In case of VOPD returns 2 diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f43ec73db7e1f..80494d993f425 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -51,7 +51,6 @@ #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 72eb3d0f8b7f4..b6897608a952c 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/Support/ErrorHandling.h" -#include namespace llvm { diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index e81bb4745faff..98798275e7979 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -476,7 +476,7 @@ bool CSKYFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. 
MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, Register()); } @@ -498,8 +498,7 @@ bool CSKYFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : reverse(CSI)) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 34a7de8d8ae96..3ab09902be3aa 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -24,8 +24,9 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "CSKYGenInstrInfo.inc" -CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, RI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), +CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI, + const CSKYRegisterInfo &TRI) + : CSKYGenInstrInfo(STI, TRI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); @@ -393,7 +394,6 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -434,10 +434,12 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void CSKYInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, 
MachineInstr::MIFlag Flags) const { +void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 6451c0af14fc0..d1cd0395f3b95 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -21,6 +21,7 @@ namespace llvm { +class CSKYRegisterInfo; class CSKYSubtarget; class CSKYInstrInfo : public CSKYGenInstrInfo { @@ -33,7 +34,7 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { const CSKYSubtarget &STI; public: - explicit CSKYInstrInfo(const CSKYSubtarget &STI); + CSKYInstrInfo(const CSKYSubtarget &STI, const CSKYRegisterInfo &RI); Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -42,14 +43,12 @@ class CSKYInstrInfo : public CSKYGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp index a554d1c0e739b..94e412ec81725 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.cpp +++ 
b/llvm/lib/Target/CSKY/CSKYSubtarget.cpp @@ -92,7 +92,7 @@ CSKYSubtarget::CSKYSubtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS, const TargetMachine &TM) : CSKYGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering(initializeSubtargetDependencies(TT, CPU, TuneCPU, FS)), - InstrInfo(*this), RegInfo(), TLInfo(TM, *this) { + InstrInfo(*this, RegInfo), TLInfo(TM, *this) { TSInfo = std::make_unique(); } diff --git a/llvm/lib/Target/CSKY/CSKYSubtarget.h b/llvm/lib/Target/CSKY/CSKYSubtarget.h index a3f2ddcb7165b..f5ad26a20d8a5 100644 --- a/llvm/lib/Target/CSKY/CSKYSubtarget.h +++ b/llvm/lib/Target/CSKY/CSKYSubtarget.h @@ -30,8 +30,8 @@ class CSKYSubtarget : public CSKYGenSubtargetInfo { virtual void anchor(); CSKYFrameLowering FrameLowering; - CSKYInstrInfo InstrInfo; CSKYRegisterInfo RegInfo; + CSKYInstrInfo InstrInfo; CSKYTargetLowering TLInfo; std::unique_ptr TSInfo; diff --git a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp index eca5ac140f3c3..bae3484eee1cb 100644 --- a/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp @@ -24,7 +24,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp index 74e5abe2599c7..c6fffde84af58 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMux.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -43,7 +43,6 @@ #include #include #include -#include #define DEBUG_TYPE "hexmux" diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 9c81e9638f8e2..5344ed8446efc 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -30,7 +30,6 @@ #include #include #include -#include #define DEBUG_TYPE "gen-pred" diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp 
index 54f5608d460af..f375b25e4ceb8 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -34,7 +34,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index 3b1d3bd89680b..4cab5da7b1caf 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -26,7 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include using namespace llvm; using namespace rdf; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index e1b37fd88f44d..9bbb3aad89c44 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -89,7 +89,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 98c873824bc1d..a2b75e4a42e76 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -11,7 +11,6 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCFixup.h" -#include #undef RISCV diff --git a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h index a329fd5ed9d29..c99d603d340ea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h +++ b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h @@ -22,8 +22,6 @@ #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include - namespace llvm { namespace SPIRV { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5bce539c45341..fa3dce256046f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53412,7 +53412,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const 
SDLoc &DL, return SDValue(); // SrcVal must be a matching normal load further up the chain. - auto *Ld = dyn_cast(SrcVal); + auto *Ld = dyn_cast(peekThroughBitcasts(SrcVal)); if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr() || Ld->getOffset() != St->getOffset() || diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7355716..70564973816b1 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w { defm KSET0 : avx512_mask_setop_w; defm KSET1 : avx512_mask_setop_w; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop; + defm KSET1B : avx512_mask_setop; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3179,34 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; +} +// Submask patterns: lower N bits set in larger mask registers +let Predicates = [HasBWI, HasDQI] in { + // v32i1 submasks + def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>; + def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>; + // v64i1 submasks + def : 
Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>; + def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>; + def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D), + VK64)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 61d9608160197..cb0208a4a5f32 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -788,9 +788,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. 
+ case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index d0d897e6784d3..829a32eb37118 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -54,7 +54,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1099aa335e4c5..0c8b9043fcbbb 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -65,7 +65,6 @@ #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 0bc5964132b72..20cfbe460a351 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -63,7 +63,6 @@ #include #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 94dfd3a974923..5b94897f4342f 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -66,7 +66,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include namespace llvm { diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ca90bb65f5708..1e614bd29ee6e 100644 --- 
a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include using namespace llvm; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 3840b464e6c2c..72858e1265d86 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1787,6 +1787,12 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } + bool areAllOperandsInvariant() const { + return all_of(operands(), [](VPValue *Op) { + return Op->isDefinedOutsideLoopRegions(); + }); + } + public: VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef Operands) : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP), diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba145ffa0b681..9a5591feb3d05 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2523,32 +2523,51 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - assert( - any_of(operands(), - [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) && - "Expected at least one loop-variant operand"); - - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers unless VF is scalar. - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. 
- SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, - "", getGEPNoWrapFlags()); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP); + if (areAllOperandsInvariant()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + SmallVector Ops; + for (unsigned I = 0, E = getNumOperands(); I != E; I++) + Ops.push_back(State.get(getOperand(I), VPLane(0))); + + auto *NewGEP = + State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops), + "", getGEPNoWrapFlags()); + Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); + State.set(this, Splat); + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers unless VF is scalar. + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); + + // Collect all the indices for the new GEP. 
If any index is + // loop-invariant, we won't broadcast it. + SmallVector Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, + "", getGEPNoWrapFlags()); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP); + } } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 10afd006c90c9..1b9bc4cc45163 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1387,8 +1387,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (!isa(&R)) + if (!isa(&R)) continue; auto *RepR = dyn_cast(&R); if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll new file mode 100644 index 0000000000000..dc23f51987635 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr166870.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O3 < %s -mtriple=aarch64 | FileCheck %s + +; The seemingly redundant mov where src_reg == dst_reg shouldn't be removed, +; because it has the effect of zeroing the upper bits in x8. 
+ +define i32 @ham(i32 %arg, i1 %arg1, i1 %arg2, ptr %arg3) nounwind { +; CHECK-LABEL: ham: +; CHECK: // %bb.0: // %bb +; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: tbnz w1, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %bb4 +; CHECK-NEXT: tbnz w2, #0, .LBB0_3 +; CHECK-NEXT: // %bb.2: // %bb5 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov w21, w1 +; CHECK-NEXT: mov w20, w0 +; CHECK-NEXT: bl zot +; CHECK-NEXT: tbz w21, #0, .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %bb6 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov w8, w20 +; CHECK-NEXT: mov w20, wzr +; CHECK-NEXT: mov w8, w8 +; CHECK-NEXT: mov w21, w8 +; CHECK-NEXT: .LBB0_5: // %bb7 +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: strb w20, [x19] +; CHECK-NEXT: cbnz x21, .LBB0_5 +; CHECK-NEXT: // %bb.6: // %bb8 +; CHECK-NEXT: // in Loop: Header=BB0_5 Depth=1 +; CHECK-NEXT: bl quux +; CHECK-NEXT: b .LBB0_5 +bb: + br i1 %arg1, label %bb6, label %bb4 + +bb4: + %load = load ptr, ptr null, align 8 + br i1 %arg2, label %bb6, label %bb5 + +bb5: + %call = call i32 @zot() #0 + %zext = zext i32 %arg to i64 + br i1 %arg1, label %bb6, label %bb7 + +bb6: + ret i32 0 + +bb7: + store i8 0, ptr %arg3, align 1 + %icmp = icmp eq i64 %zext, 0 + br i1 %icmp, label %bb8, label %bb7 + +bb8: + call void @quux() + br label %bb7 +} + +declare i32 @zot() + +declare void @quux() + +attributes #0 = { returns_twice } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll index 881cf977eab90..16ba79403cef2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -52,16 +52,14 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg 
%count) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s32, 16 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_i32 s0, s0, 15 -; GFX12-NEXT: s_mov_b32 s1, s32 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, -16 -; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_mov_b32 s1, s32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s32, s1, s0 ; GFX12-NEXT: s_endpgm @@ -70,8 +68,7 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s32, 16 -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 -; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX942-NEXT: s_and_b32 s0, s0, -16 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_lshl_b32 s0, s0, 6 @@ -211,18 +208,16 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) ; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_add_co_i32 s0, s0, 15 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, -16 ; GFX12-NEXT: s_mov_b32 s1, s32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s0, s0, 5 -; GFX12-NEXT: scratch_store_b32 off, v0, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] @@ -232,8 +227,7 @@ 
define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s32, 16 -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 -; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX942-NEXT: s_and_b32 s0, s0, -16 ; GFX942-NEXT: s_lshl_b32 s2, s0, 6 ; GFX942-NEXT: s_getpc_b64 s[0:1] @@ -395,15 +389,13 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) ; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_add_co_i32 s0, s0, 15 -; GFX12-NEXT: s_mov_b32 s4, s32 -; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_b32 s0, s0, -16 -; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s4, s32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_add_co_i32 s32, s4, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 @@ -416,8 +408,7 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_mov_b32 s32, 16 -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 -; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX942-NEXT: s_and_b32 s0, s0, -16 ; GFX942-NEXT: s_lshl_b32 s2, s0, 6 ; GFX942-NEXT: s_getpc_b64 s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 9a4040a25419a..49977a4c64784 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -265,8 +265,7 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac 
; GFX9-NEXT: v_mov_b32_e32 v0, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 2 -; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_lshl2_add_u32 s0, s2, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 @@ -282,9 +281,8 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_sub_i32 s2, s2, s3 ; GFX11-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-NEXT: s_lshl_b32 s2, s2, 2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: s_lshl2_add_u32 s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index ed767aeaf112f..6dad2258781b3 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -13,8 +13,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -53,12 +52,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -88,13 +86,12 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 -; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 +; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-SDAG-NEXT: s_add_i32 s5, s32, 0x1fff ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 10 @@ -137,12 +134,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -178,8 +174,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; 
GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -218,12 +213,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned( ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -367,26 +361,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: ; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 -; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: 
Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -609,8 +603,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 @@ -639,8 +632,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 2 -; GFX9-SDAG-NEXT: s_add_i32 s4, s4, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: s_lshl_b32 s4, s4, 6 @@ -719,20 +711,17 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 ; GFX11-SDAG-NEXT: ; 
%bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff -; GFX11-SDAG-NEXT: s_add_i32 s1, s1, 15 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 ; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 @@ -750,18 +739,16 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 +; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -866,9 +853,8 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 2 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff -; GFX9-SDAG-NEXT: s_add_i32 s5, s5, 15 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 @@ -964,16 +950,15 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 ; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s1, 2 +; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 15 -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 -; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 +; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1 ; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 ; GFX11-SDAG-NEXT: s_endpgm ; GFX11-SDAG-NEXT: .LBB7_6: @@ -1173,35 +1158,35 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 -; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 +; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1241,37 +1226,36 @@ define void 
@test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s5, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 +; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0xfff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s5 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: @@ -1861,20 +1845,20 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 ; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s11, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 +; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 @@ -1905,7 +1889,6 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; 
GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec ; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 @@ -1923,7 +1906,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 @@ -2027,27 +2011,26 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 ; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v1, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s4 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 +; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 +; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 ; GFX11-SDAG-NEXT: ; %bb.3: ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 
-; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 @@ -2070,28 +2053,27 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v1, s2 +; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 ; GFX11-SDAG-NEXT: ; %bb.8: ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 dlc +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX11-SDAG-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 @@ -2200,9 +2182,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s11, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s12, s34 +; GFX9-SDAG-NEXT: s_mov_b32 s13, s34 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 @@ -2212,24 +2194,24 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec ; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v1, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7] +; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10 +; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10 +; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11 ; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 ; 
GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff -; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s8, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 ; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow @@ -2259,8 +2241,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s12 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s11 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s13 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s12 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: @@ -2332,9 +2314,9 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s7, s34 ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 @@ -2344,28 +2326,28 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 -; GFX11-SDAG-NEXT: 
v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo +; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 +; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 +; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 ; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s1, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow ; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 @@ -2394,8 +2376,8 @@ define void 
@test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s7 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s6 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index b0e6752386285..e01cb79382c05 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -524,7 +524,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 @@ -695,7 +695,7 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -875,7 +875,7 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1 +; GFX942-SDAG-NEXT: s_lshl1_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ 
-1054,7 +1054,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 @@ -1225,7 +1225,7 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -1405,7 +1405,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 0 ; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0 ; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index fc8883924dfbc..870b679a84d11 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -857,13 +857,13 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s32 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: 
scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -915,13 +915,13 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s32 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -929,8 +929,8 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v1, s32 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -2146,16 +2146,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_small_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s1, s32, 0x100 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x100 
-; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s1 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2214,16 +2214,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x100 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -2231,11 +2231,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_small_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s1, s32, 0x100 ; GFX942-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_add_i32 
s0, s32, 0x100 -; GFX942-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s1 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -3447,16 +3447,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-LABEL: store_load_vindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: s_mov_b32 s0, s1 +; GFX9-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: scratch_store_dword v1, v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3516,16 +3516,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-PAL-NEXT: s_mov_b32 s0, s1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v1, v0, 2, s0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX9-PAL-NEXT: 
scratch_store_dword v2, v3, off +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] @@ -3533,11 +3533,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX942-LABEL: store_load_vindex_large_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_add_i32 s1, s32, 0x4004 ; GFX942-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: s_add_i32 s0, s32, 0x4004 -; GFX942-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, v1 +; GFX942-NEXT: s_mov_b32 s0, s1 +; GFX942-NEXT: v_lshl_add_u32 v1, v0, 2, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 @@ -3940,12 +3940,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -4001,15 +4001,15 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-PAL-NEXT: 
s_mov_b32 s1, 0 ; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc @@ -4020,11 +4020,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_mov_b32 s1, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 15 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NEXT: scratch_store_dword v0, v1, off offset:1024 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:1024 sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 60ac0b943faf4..29163c111fc5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -8894,6 +8894,501 @@ define double @v_test_fmed3_r_i_i_f64_minimumnum_maximumnum(double %a) { ret double %med } +define float @v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum(float %a) { +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: 
v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX9-GISEL-NEXT: 
v_cmp_o_f32_e32 vcc, 2.0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX9-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32_maximum_minimum: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v0 +; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 2.0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v0 +; GFX11-GISEL-NEXT: v_cmp_o_f32_e32 vcc_lo, 4.0, v0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v1, vcc_lo +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %a.add = fadd nnan float %a, 1.0 + %max = call float @llvm.maximum.f32(float %a.add, float 2.0) + %med = call float @llvm.minimum.f32(float %max, float 4.0) + ret float %med +} + +define <2 x half> @v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum(<2 x half> %a) { +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-SDAG-NEXT: 
v_add_f32_e32 v1, 1.0, v1 +; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v0 +; SI-GISEL-NEXT: v_max_f32_e32 v4, 2.0, v1 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-GISEL-NEXT: v_min_f32_e32 v3, 4.0, v0 +; SI-GISEL-NEXT: v_min_f32_e32 v4, 4.0, v1 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-SDAG-NEXT: v_add_f16_e32 v1, 1.0, v0 +; VI-SDAG-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-SDAG-NEXT: v_max_f16_e32 v0, 
2.0, v0 +; VI-SDAG-NEXT: v_max_f16_e32 v1, 2.0, v1 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-SDAG-NEXT: v_min_f16_e32 v1, 4.0, v1 +; VI-SDAG-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-SDAG-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_add_f16_e32 v1, 1.0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v1 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v1 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; 
GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0] +; GFX9-GISEL-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 2.0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v1, 16, v3 +; GFX9-GISEL-NEXT: v_pk_min_f16 v3, v3, 4.0 op_sel_hi:[1,0] +; GFX9-GISEL-NEXT: v_cmp_o_f16_e64 s[4:5], 4.0, v0 +; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[4:5] +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; 
GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0] +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v2 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v3, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_v2f16_maximum_minimum: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_pk_max_f16 v1, v0, 2.0 op_sel_hi:[1,0] +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 2.0, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_pk_min_f16 v1, v0, 4.0 op_sel_hi:[1,0] +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e64 s0, 4.0, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v1.l, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.h, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] + %a.add = fadd nnan <2 x half> %a, splat (half 1.0) + %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.add, <2 x half> splat (half 2.0)) + %med = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max, <2 x half> splat (half 4.0)) + ret <2 x half> %med +} + +define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) { +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0 +; SI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-SDAG-NEXT: v_max_f16_e32 v0, 2.0, v0 +; VI-SDAG-NEXT: v_min_f16_e32 v0, 4.0, v0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 2.0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v0 +; VI-GISEL-NEXT: v_cmp_o_f32_e32 vcc, 4.0, v0 +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX9-SDAG-NEXT: v_med3_f16 v0, v0, 2.0, 4.0 +; GFX9-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX9-GISEL-NEXT: v_max_f16_e32 v1, 2.0, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 2.0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-GISEL-NEXT: v_min_f16_e32 v1, 4.0, v0 +; GFX9-GISEL-NEXT: v_cmp_o_f16_e32 vcc, 4.0, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_med3_f16 v0, v0, 2.0, 4.0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_max_f16_e32 v1, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_min_f16_e32 v1, 4.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_med3_f16 v0.l, v0.l, 2.0, 4.0 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_max_f16_e32 v0.h, 2.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 2.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.h, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, 4.0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] + %a.add = fadd nnan half %a, 1.0 + %max = call half @llvm.maximum.f16(half %a.add, half 2.0) + %med = call half @llvm.minimum.f16(half %max, half 4.0) + ret half %med +} + +define double @v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum(double %a) { +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; SI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; SI-GISEL-NEXT: v_max_f64 v[2:3], 
v[0:1], 2.0 +; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; SI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1] +; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; VI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; VI-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0 +; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; VI-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; VI-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1] +; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-GISEL-NEXT: 
v_mov_b32_e32 v4, 0x7ff80000 +; GFX9-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0 +; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 2.0, v[0:1] +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX9-GISEL-NEXT: v_cmp_o_f64_e32 vcc, 4.0, v[0:1] +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f64_maximum_minimum: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_f64 v[2:3], v[0:1], 2.0 +; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 2.0, v[0:1] +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo +; GFX11-GISEL-NEXT: v_min_f64 v[2:3], v[0:1], 4.0 +; GFX11-GISEL-NEXT: v_cmp_o_f64_e32 vcc_lo, 4.0, v[0:1] +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX11-GISEL-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v3, vcc_lo +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %a.add = fadd nnan double %a, 1.0 + %max = call double 
@llvm.maximum.f64(double %a.add, double 2.0) + %med = call double @llvm.minimum.f64(double %max, double 4.0) + ret double %med +} + declare i32 @llvm.amdgcn.workitem.id.x() #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.minnum.f32(float, float) #0 diff --git a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll index b5f0b2ff9ef4c..61902b5fd4661 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-flags-stack-offsets.ll @@ -18,8 +18,8 @@ define void @gep_noflags_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_noflags_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -45,8 +45,8 @@ define void @gep_inbounds_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -72,8 +72,8 @@ define void @gep_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -99,8 +99,8 @@ define void 
@gep_nusw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -126,8 +126,8 @@ define void @gep_inbounds_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_inbounds_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -153,8 +153,8 @@ define void @gep_nusw_nuw_alloca(i32 %idx, i32 %val) #0 { ; GFX9-LABEL: gep_nusw_nuw_alloca: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 6, s32 -; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX9-NEXT: s_lshr_b32 s4, s32, 6 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, s4 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll index 8bd6c0f2652cf..d24b3a23cb9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll +++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) { } ; CHECK-LABEL: {{^}}dynamic_shared_array_1: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0xc00 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define 
amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) { entry: @@ -49,7 +49,7 @@ endif: ; preds = %else, %if } ; CHECK-LABEL: {{^}}dynamic_shared_array_2: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x4000 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -64,7 +64,7 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the type ; specified. ; CHECK-LABEL: {{^}}dynamic_shared_array_3: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -80,7 +80,7 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) { ; The offset to the dynamic shared memory array should be aligned on the ; maximal one. ; CHECK-LABEL: {{^}}dynamic_shared_array_4: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x48 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { @@ -99,7 +99,7 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) { ; Honor the explicit alignment from the specified variable. 
; CHECK-LABEL: {{^}}dynamic_shared_array_5: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x44 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { @@ -118,7 +118,7 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) { ; Honor the explicit alignment from the specified variable. ; CHECK-LABEL: {{^}}dynamic_shared_array_6: -; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50 +; CHECK: s_movk_i32 [[DYNLDS:s[0-9]+]], 0x50 ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]] ; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]] define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 3eef616ba267d..ad894ce36c55b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -136,10 +135,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 4 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -163,8 +161,7 @@ 
define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 4 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 4 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -202,10 +199,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -229,8 +225,7 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -268,10 +263,9 @@ define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) { ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 @@ -295,8 +289,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_mov_b32_e32 v0, 2 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_lshl_b32 s0, s0, 2 -; CHECK-NEXT: s_add_i32 s0, s0, 8 +; CHECK-NEXT: s_lshl2_add_u32 s0, s0, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-NEXT: ds_write_b16 v1, v0 ; CHECK-NEXT: ds_write_b32 v2, v1 @@ -334,10 +327,9 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) ; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] -; CHECK-NEXT: s_lshl_b32 s4, s17, 2 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_add_i32 s4, s4, 8 +; CHECK-NEXT: s_lshl2_add_u32 s4, s17, 8 ; CHECK-NEXT: v_mov_b32_e32 v2, 2 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: ds_write_b16 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll index 3c55dcb486675..447cb62643384 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -6,8 +6,8 @@ ; ELF: Relocations [ ; ELF-NEXT: Section (3) .rel.text { -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.external -; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32_LO lds.defined +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.external +; ELF-NEXT: 0x{{[0-9A-F]*}} R_AMDGPU_ABS32_LO lds.defined ; ELF-NEXT: } ; ELF-NEXT: ] @@ -32,10 +32,10 @@ ; ELF-NEXT: } ; GCN-LABEL: {{^}}test_basic: -; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN: s_mov_b32 s0, lds.external@abs32@lo ; encoding: [0xff,0x00,0x80,0xbe,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} ; -; GCN: s_add_i32 s0, s0, lds.defined@abs32@lo ; encoding: [0x00,0xff,0x00,0x81,A,A,A,A] +; GCN: s_lshl2_add_u32 s0, s2, lds.defined@abs32@lo ; encoding: [0x02,0xff,0x80,0x97,A,A,A,A] ; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} ; ; GCN: .globl lds.external diff --git 
a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll index 69439d49e588f..de82dcdecda48 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll @@ -102,10 +102,9 @@ define void @test_workgroup_id_x_non_kernel_optimized_fixed(ptr addrspace(1) %ou ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s1, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index 497241cff392d..6b6658bd672de 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -234,19 +234,18 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" { ; ; GFX1250-SDAG-LABEL: workgroup_id_optimized: ; GFX1250-SDAG: ; %bb.0: ; %.entry -; GFX1250-SDAG-NEXT: s_lshl_b32 s0, ttmp9, 1 -; GFX1250-SDAG-NEXT: s_and_b32 s1, ttmp6, 15 -; GFX1250-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 14 -; GFX1250-SDAG-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-SDAG-NEXT: s_and_b32 s0, s2, 0x3fffc +; GFX1250-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 14 ; GFX1250-SDAG-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX1250-SDAG-NEXT: s_and_b32 s0, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_and_b32 s1, s1, 0x3fffc ; GFX1250-SDAG-NEXT: s_bfe_u32 s3, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_mul_i32 s2, s2, 3 ; 
GFX1250-SDAG-NEXT: s_bfe_u32 s4, ttmp6, 0x40004 -; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s0 +; GFX1250-SDAG-NEXT: s_lshl1_add_u32 s0, ttmp9, s0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, s1 ; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX1250-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null ; GFX1250-SDAG-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 52671f5d3deb4..c4e77c457db3a 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1622,15 +1622,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_1-NEXT: s_lshr_b32 s55, s32, 5 -; GFX10_1-NEXT: s_add_i32 s55, s55, s4 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_1-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_1-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s55, scc ; GFX10_1-NEXT: ;;#ASMEND @@ -1652,15 +1651,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: v_writelane_b32 v1, s55, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_3-NEXT: s_lshr_b32 
s55, s32, 5 -; GFX10_3-NEXT: s_add_i32 s55, s55, s4 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_addk_i32 s4, 0x4040 +; GFX10_3-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 -; GFX10_3-NEXT: s_addk_i32 s55, 0x4040 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s55, scc ; GFX10_3-NEXT: ;;#ASMEND @@ -1681,13 +1679,13 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v1, s55, 0 ; GFX11-NEXT: s_add_i32 s1, s32, 64 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_add_i32 s55, s32, s0 +; GFX11-NEXT: s_add_i32 s1, s32, 0x4040 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_addk_i32 s55, 0x4040 +; GFX11-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s55, scc @@ -1712,16 +1710,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v1, s55, 0 -; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_add_co_i32 s1, s32, 0x4000 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_i32 s55, s32, s0 +; GFX12-NEXT: s_lshl2_add_u32 s55, s0, s1 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_addk_co_i32 s55, 0x4000 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s55, scc ; GFX12-NEXT: ;;#ASMEND @@ -1770,10 +1766,9 @@ define void 
@scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: v_writelane_b32 v1, s55, 0 -; GFX900-NEXT: s_lshl_b32 s4, s16, 2 -; GFX900-NEXT: s_lshr_b32 s55, s32, 6 -; GFX900-NEXT: s_add_i32 s55, s55, s4 -; GFX900-NEXT: s_addk_i32 s55, 0x4040 +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 +; GFX900-NEXT: s_addk_i32 s4, 0x4040 +; GFX900-NEXT: s_lshl2_add_u32 s55, s16, s4 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1799,9 +1794,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX942-NEXT: scratch_store_dword off, v1, s1 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: v_writelane_b32 v1, s55, 0 -; GFX942-NEXT: s_lshl_b32 s0, s0, 2 -; GFX942-NEXT: s_add_i32 s55, s32, s0 -; GFX942-NEXT: s_addk_i32 s55, 0x4040 +; GFX942-NEXT: s_add_i32 s1, s32, 0x4040 +; GFX942-NEXT: s_lshl2_add_u32 s55, s0, s1 ; GFX942-NEXT: s_add_i32 s0, s32, 64 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index c6f7ce51f5ea2..9888204b997a9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -260,12 +260,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-WGP-LABEL: local_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; 
GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -274,12 +273,11 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX10-CU-LABEL: local_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -311,15 +309,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -328,15 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -345,15 +339,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -362,15 +354,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: 
local_nontemporal_load_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -379,14 +369,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-WGP-LABEL: local_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -395,14 +384,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX11-CU-LABEL: local_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -411,15 +399,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-WGP-LABEL: local_nontemporal_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -428,15 +414,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-LABEL: local_nontemporal_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -445,14 +429,13 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX1250-LABEL: local_nontemporal_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -679,12 +662,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-WGP-LABEL: local_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -692,12 +674,11 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX10-CU-LABEL: local_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; 
GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -720,15 +701,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -736,15 +715,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword 
s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 ; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -752,15 +729,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -768,15 +743,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-TGSPLIT-NEXT: ds_write_b32 v0, v1 @@ -784,14 +757,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-WGP-LABEL: local_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -799,14 +771,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX11-CU-LABEL: local_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -814,15 +785,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-WGP-LABEL: local_nontemporal_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -830,15 +799,13 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX12-CU-LABEL: local_nontemporal_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; 
GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -846,15 +813,14 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; ; GFX1250-LABEL: local_nontemporal_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index d686e7a2d5b4c..33c516c61e42c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -208,12 +208,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-WGP-LABEL: local_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 
-; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: ds_read_b32 v1, v1 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -222,12 +221,11 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX10-CU-LABEL: local_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: ds_read_b32 v1, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -259,14 +257,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-WGP-LABEL: local_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: ds_load_b32 v1, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -275,14 +272,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX11-CU-LABEL: local_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: ds_load_b32 v1, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -291,15 +287,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-WGP-LABEL: local_volatile_load_1: ; GFX12-WGP: ; %bb.0: ; %entry ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-WGP-NEXT: s_mov_b32 s2, 2 +; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-WGP-NEXT: ds_load_b32 v1, v1 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -308,15 +302,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-LABEL: local_volatile_load_1: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX12-CU-NEXT: s_mov_b32 s2, 2 +; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, 
s3 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX12-CU-NEXT: ds_load_b32 v1, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -325,14 +317,13 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX1250-LABEL: local_volatile_load_1: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_mov_b32 s2, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX1250-NEXT: ds_load_b32 v1, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -511,12 +502,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-WGP-LABEL: local_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 @@ -524,12 +514,11 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX10-CU-LABEL: local_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 @@ -552,14 +541,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-WGP-LABEL: local_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -567,14 +555,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX11-CU-LABEL: local_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 
+; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -582,15 +569,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-WGP-LABEL: local_volatile_store_1: ; GFX12-WGP: ; %bb.0: ; %entry -; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-WGP-NEXT: s_mov_b32 s1, 2 -; GFX12-WGP-NEXT: s_wait_alu 0xfffe -; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 @@ -598,15 +583,13 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX12-CU-LABEL: local_volatile_store_1: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX12-CU-NEXT: s_mov_b32 s1, 2 -; GFX12-CU-NEXT: s_wait_alu 0xfffe -; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 @@ -614,15 +597,14 @@ define amdgpu_kernel void @local_volatile_store_1( ; ; GFX1250-LABEL: local_volatile_store_1: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s1, 0x3ff -; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX1250-NEXT: s_mov_b32 s1, 2 -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 2, s1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 ; GFX1250-NEXT: ds_store_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 89de17ecbd1e8..6c19722ad6e33 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -270,12 +270,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword 
v0, v1, s[4:5] @@ -286,12 +285,11 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen slc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -330,15 +328,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -349,15 +345,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc slc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] @@ -366,15 +360,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-NOTTGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -383,15 +375,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX942-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX942-TGSPLIT: ; 
%bb.0: ; %entry ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 2 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX942-TGSPLIT-NEXT: scratch_load_dword v1, v1, off nt ; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] @@ -400,14 +390,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -416,14 +405,13 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -708,12 +696,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -723,12 +710,11 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -758,15 +744,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 -; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -776,15 +760,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX90A-TGSPLIT: ; %bb.0: ; %entry ; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff -; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 -; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 -; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 -; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s5, v1 +; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff +; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s6 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen glc slc @@ -792,15 +774,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NOTTGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -808,15 +788,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX942-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX942-TGSPLIT: ; %bb.0: ; %entry -; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX942-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff -; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX942-TGSPLIT-NEXT: s_mov_b32 s1, 2 -; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 -; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, s1, v1 +; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX942-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff +; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-TGSPLIT-NEXT: scratch_store_dword v1, v0, off nt @@ -824,14 +802,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-WGP-LABEL: private_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off glc slc dlc @@ -839,14 +816,13 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX11-CU-LABEL: private_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off glc slc dlc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index 7faa0621aa6d0..7c23b76cec3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -228,12 +228,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-WGP-NEXT: s_mov_b32 s6, 2 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] @@ -244,12 +243,11 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-CU-NEXT: s_mov_b32 s6, 2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 +; 
GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6 ; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -286,14 +284,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-WGP-NEXT: s_mov_b32 s2, 2 +; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -302,14 +299,13 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 -; GFX11-CU-NEXT: s_mov_b32 s2, 2 +; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2 ; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -578,12 +574,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: 
s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-WGP-NEXT: s_mov_b32 s5, 2 -; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -594,12 +589,11 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 +; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-CU-NEXT: s_mov_b32 s5, 2 -; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, s5, s6 +; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s5 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen @@ -629,14 +623,13 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-WGP-LABEL: private_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-WGP-NEXT: s_mov_b32 s1, 2 
-; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: scratch_store_b32 v1, v0, off dlc @@ -645,14 +638,13 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX11-CU-LABEL: private_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff -; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 -; GFX11-CU-NEXT: s_mov_b32 s1, 2 -; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, s1, s2 +; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX11-CU-NEXT: v_lshl_add_u32 v1, v0, 2, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: scratch_store_b32 v1, v0, off dlc diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll new file mode 100644 index 0000000000000..b7e6ed26876c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshl_add.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @lshl1_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl1_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; 
CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 1, s0 +; CHECK-NEXT: buffer_store_b16 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i16, ptr addrspace(7) %in2, i64 %1 + store i16 0, ptr addrspace(7) %gep, align 2 + ret void +} + +define amdgpu_kernel void @lshl2_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl2_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v1, v1, 2, s0 +; CHECK-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i32, ptr addrspace(7) %in2, i64 %1 + store i32 0, ptr addrspace(7) %gep, align 4 + ret void +} + +define amdgpu_kernel void @lshl3_add(ptr addrspace(1) %in, ptr 
addrspace(7) %in2) { +; CHECK-LABEL: lshl3_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v2, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 +; CHECK-NEXT: s_mov_b32 s3, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v2, v2, 3, s0 +; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i64, ptr addrspace(7) %in2, i64 %1 + store i64 0, ptr addrspace(7) %gep, align 8 + ret void +} + +define amdgpu_kernel void @lshl4_add(ptr addrspace(1) %in, ptr addrspace(7) %in2) { +; CHECK-LABEL: lshl4_add: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 +; CHECK-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_load_b32 s5, s[4:5], 0x54 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_mov_b32 s9, s4 +; CHECK-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: global_load_b32 v3, v0, s[6:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: s_mov_b32 s6, s3 +; CHECK-NEXT: s_mov_b32 s8, s1 +; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] +; CHECK-NEXT: s_mov_b32 s5, s2 +; 
CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; CHECK-NEXT: v_lshl_add_u32 v4, v3, 4, s0 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: buffer_store_b128 v[0:3], v4, s[4:7], null offen +; CHECK-NEXT: s_endpgm + %vaddr = load volatile i32, ptr addrspace(1) %in, align 4 + %1 = sext i32 %vaddr to i64 + %gep = getelementptr i128, ptr addrspace(7) %in2, i64 %1 + store i128 0, ptr addrspace(7) %gep, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 6e2d0f6503a20..7e2bfa666a19f 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -144,7 +144,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: global_load_dword [[VADDR:v[0-9]+]], -; GFX9: v_lshlrev_b32_e32 [[ADDR:v[0-9]+]], 2, [[VADDR]] +; GFX9: v_lshl_add_u32 [[ADDR:v[0-9]+]], [[VADDR]], 2, s{{[0-9]+}} ; GFX9-NOT [[ADDR]] ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index bac460949d579..a2c86bd09404a 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -33,11 +33,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: s_mov_b32 s6, s32 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, 1 -; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 ; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000 ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4 -; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: s_lshl2_add_u32 s6, s10, s6 ; 
MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -68,10 +67,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: s_mov_b32 s2, s32 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 ; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 -; FLATSCR-NEXT: s_add_i32 s2, s2, s3 +; FLATSCR-NEXT: s_lshl2_add_u32 s2, s6, s2 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) @@ -132,12 +130,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; MUBUF-NEXT: ; %bb.1: ; %bb.0 ; MUBUF-NEXT: s_add_i32 s4, s32, 0xfff ; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000 -; MUBUF-NEXT: s_lshl_b32 s5, s5, 2 ; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 -; MUBUF-NEXT: s_add_i32 s4, s4, s5 +; MUBUF-NEXT: s_lshl2_add_u32 s4, s5, s4 ; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v2, s4 @@ -168,10 +165,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 -; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2 ; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0 -; FLATSCR-NEXT: s_add_i32 s0, s0, s1 +; FLATSCR-NEXT: s_lshl2_add_u32 s0, s1, s0 ; FLATSCR-NEXT: scratch_load_dword v2, off, s0 ; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll index 
3e507a0c5889f..ba8ae9554d0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll @@ -14,8 +14,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl1_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl1_add_u32: @@ -26,8 +25,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl1_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl1_add_u32: @@ -53,8 +51,7 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl2_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl2_add_u32: @@ -65,8 +62,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl2_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl2_add_u32: @@ -92,8 +88,7 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl3_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 ; 
GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl3_add_u32: @@ -104,8 +99,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl3_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl3_add_u32: @@ -131,8 +125,7 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) { define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) { ; GFX9-SDAG-LABEL: s_shl4_add_u32: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl4_add_u32: @@ -143,8 +136,7 @@ define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) { ; ; GFX10-SDAG-LABEL: s_shl4_add_u32: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s1 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl4_add_u32: @@ -598,10 +590,8 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) { define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl1_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 1 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl1_add_u32_v2: @@ -614,10 +604,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl1_add_u32_v2: ; GFX10-SDAG: ; 
%bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 1 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl1_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl1_add_u32_v2: @@ -647,10 +635,8 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl2_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 2 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl2_add_u32_v2: @@ -663,10 +649,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl2_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 2 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl2_add_u32_v2: @@ -696,10 +680,8 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl3_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 3 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to 
shader part epilog ; ; GFX8-SDAG-LABEL: s_shl3_add_u32_v2: @@ -712,10 +694,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl3_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 3 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl3_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl3_add_u32_v2: @@ -745,10 +725,8 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl4_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl4_add_u32_v2: @@ -761,10 +739,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i ; ; GFX10-SDAG-LABEL: s_shl4_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl4_add_u32_v2: @@ -794,10 +770,8 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) { ; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2: ; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-SDAG-NEXT: 
s_lshl_b32 s1, s1, 4 -; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3 -; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2 +; GFX9-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 +; GFX9-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 ; GFX9-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2: @@ -810,10 +784,8 @@ define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32 ; ; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2 -; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3 +; GFX10-SDAG-NEXT: s_lshl2_add_u32 s0, s0, s2 +; GFX10-SDAG-NEXT: s_lshl4_add_u32 s1, s1, s3 ; GFX10-SDAG-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_2_4_add_u32_v2: diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..90304b2c730cb 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -69,6 +69,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: 
[[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc @@ -92,7 +93,6 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4) @@ -101,6 +101,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc @@ -113,10 +114,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[S_LSHL4_ADD_U32_:%[0-9]+]]:sreg_32 = S_LSHL4_ADD_U32 [[COPY12]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %383:sgpr_128, 
[[S_LSHL4_ADD_U32_]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4) @@ -127,25 +126,25 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, 
implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -233, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM5]], -249, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM3]], -297, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_10:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -313, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_11:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = 
S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_3]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_3]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) @@ -164,11 +163,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; 
CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -185,11 +184,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], 
implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] @@ -198,32 +197,32 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 - ; CHECK-NEXT: KILL undef %470:sreg_64 + ; CHECK-NEXT: KILL undef %469:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 - ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) - ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, 
implicit-def dead $scc - ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc + ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc + ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY18]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc - ; CHECK-NEXT: 
[[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc - ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -475, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_19:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -491, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_20:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc + ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) @@ -236,10 +235,10 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 
0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] + ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] - ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec @@ -310,15 +309,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -216, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_36:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_35]], [[V_ADD_U32_e64_17]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_37:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_36]], [[V_ADD_U32_e64_18]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_37]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_38]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_39]], implicit $exec - ; CHECK-NEXT: 
[[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_40]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_41]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_42]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_43]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_38:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_6]], [[V_OR_B32_e64_37]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_39:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_7]], [[V_OR_B32_e64_38]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_40:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_8]], [[V_OR_B32_e64_39]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_41:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_9]], [[V_OR_B32_e64_40]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_42:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_10]], [[V_OR_B32_e64_41]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_43:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_11]], [[V_OR_B32_e64_42]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_44:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_12]], [[V_OR_B32_e64_43]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -457, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_44]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_45:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_13]], [[V_OR_B32_e64_44]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -458, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_46:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_45]], [[V_ADD_U32_e64_19]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -459, 
[[BUFFER_LOAD_FORMAT_X_IDXEN21]], 0, implicit $exec @@ -326,15 +325,15 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -466, [[BUFFER_LOAD_FORMAT_X_IDXEN22]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_48:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_47]], [[V_ADD_U32_e64_21]], implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_49:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_48]], [[V_ADD_U32_e64_22]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_49]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_50]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_51]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_52]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_53]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_54]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_55]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_56]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_57]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_50:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_14]], [[V_OR_B32_e64_49]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_51:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_15]], [[V_OR_B32_e64_50]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_52:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_16]], [[V_OR_B32_e64_51]], implicit $exec + ; CHECK-NEXT: 
[[V_OR_B32_e64_53:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_22]], [[V_OR_B32_e64_52]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_54:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_17]], [[V_OR_B32_e64_53]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_55:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_18]], [[V_OR_B32_e64_54]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_56:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_19]], [[V_OR_B32_e64_55]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_57:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_20]], [[V_OR_B32_e64_56]], implicit $exec + ; CHECK-NEXT: [[V_OR_B32_e64_58:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_21]], [[V_OR_B32_e64_57]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -555, [[BUFFER_LOAD_FORMAT_X_IDXEN23]], 0, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -556, [[BUFFER_LOAD_FORMAT_X_IDXEN24]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_59:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_58]], [[V_ADD_U32_e64_23]], implicit $exec @@ -351,13 +350,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec ; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %542:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], 
[[V_ADD_U32_e64_30]], implicit $exec - ; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc - ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec + ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc + ; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_23]], [[V_OR_B32_e64_66]], implicit $exec ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec - ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) + ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %556:vgpr_32, undef %558:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 .expVert: %0 = extractelement <31 x i32> %userData, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 21f0c008366a9..0fdc1a83dddbd 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -2029,10 +2029,10 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec +; GFX9-W64-NEXT: s_mov_b32 s2, 0 ; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 +; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, s2 ; 
GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll index b24ea9ec1561e..3c617f9854761 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll @@ -32,9 +32,12 @@ define void @constant_fold_barrier_i128(ptr %p) { ; RV32-NEXT: mv a6, a1 ; RV32-NEXT: seqz a7, a1 ; RV32-NEXT: and a1, a7, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: mv a7, a1 ; RV32-NEXT: seqz a3, a1 ; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: mv a1, a1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a6, 4(a0) ; RV32-NEXT: sw a7, 8(a0) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll index 225ceed9627b7..5f61ee2d02d24 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll @@ -103,15 +103,18 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: add a5, a5, a6 ; RV32-NEXT: mv t0, t1 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a4, a5, a6 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: sltu a6, t1, t1 ; RV32-NEXT: sltiu t1, t1, 0 ; RV32-NEXT: add t0, t0, t2 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: sltu a2, a5, a7 ; RV32-NEXT: add a6, a6, t1 ; RV32-NEXT: sltu a5, t0, t2 ; RV32-NEXT: add t0, t0, a0 +; RV32-NEXT: mv a1, a1 ; RV32-NEXT: add a2, a4, a2 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, t0, a0 @@ -155,6 +158,7 @@ define i64 @udiv64_constant_add(i64 %a) nounwind { ; RV32-NEXT: mulhu a7, a0, a2 ; RV32-NEXT: mulhu t2, a1, a3 ; RV32-NEXT: mv t1, t2 +; RV32-NEXT: mv t1, t1 ; RV32-NEXT: mul t2, a1, a3 ; RV32-NEXT: mulhu a2, a1, a2 ; RV32-NEXT: mulhu a3, a0, a3 diff --git a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir 
b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir index d739537b50d05..293b15bf9d25e 100644 --- a/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir +++ b/llvm/test/CodeGen/RISCV/machine-copyprop-noop-removal.mir @@ -1,8 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -o - %s -mtriple=riscv64 -run-pass=machine-cp -mcp-use-is-copy-instr | FileCheck %s -## This test was added to capture a case where MachineCopyPropagation risks -## leaving a no-op register move (add, x0, reg). +## This test was added to capture a case where MachineCopyPropagation may +## leave a no-op register move (add reg, x0, reg). +## Due to the bug reported in +## , we are not currently +## able to optimize this case. --- name: ham @@ -21,6 +24,7 @@ body: | ; CHECK-NEXT: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x11 = ADDI $x0, 0 + ; CHECK-NEXT: renamable $x10 = ADDI killed renamable $x10, 0 ; CHECK-NEXT: BEQ renamable $x10, $x0, %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll index b155feab9b4d9..9f326280885b5 100644 --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -1352,6 +1352,7 @@ define signext i32 @sextw_sh2add(i1 zeroext %0, ptr %1, i32 signext %2, i32 sign ; NOREMOVAL-LABEL: sextw_sh2add: ; NOREMOVAL: # %bb.0: ; NOREMOVAL-NEXT: sh2add a2, a2, a3 +; NOREMOVAL-NEXT: mv a2, a2 ; NOREMOVAL-NEXT: beqz a0, .LBB22_2 ; NOREMOVAL-NEXT: # %bb.1: ; NOREMOVAL-NEXT: sw a2, 0(a1) diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 324a0c49fb413..d698fad745dfb 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1992,6 +1992,389 @@ define <8 x i16> @avgr_u_v8i16_zext(<8 x i16> %x, <8 x i16> %y) { %c.trunc = trunc <8 x i32> %c to <8 x i16> ret <8 x i16> 
%c.trunc } +define void @avgr_undef_shuffle_lanes(ptr %res, <8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; SIMD128-LABEL: avgr_undef_shuffle_lanes: +; SIMD128: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-NEXT: # %bb.0: +; SIMD128-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 255, 255 +; SIMD128-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-NEXT: i8x16.shuffle $push2=, $2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-NEXT: return +; +; SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, v128, v128, v128, v128) -> () +; SIMD128-FAST-NEXT: # %bb.0: +; SIMD128-FAST-NEXT: i8x16.avgr_u $push1=, $1, $2 +; SIMD128-FAST-NEXT: i8x16.shuffle $push12=, $pop1, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push11=, $2=, $pop12 +; SIMD128-FAST-NEXT: i8x16.avgr_u $push0=, $3, $4 +; SIMD128-FAST-NEXT: i8x16.shuffle $push10=, $pop0, $4, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 +; SIMD128-FAST-NEXT: local.tee $push9=, $4=, $pop10 +; SIMD128-FAST-NEXT: i8x16.shuffle $push4=, $pop11, $pop9, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 +; SIMD128-FAST-NEXT: v128.const $push8=, 255, 255, 255, 255, 255, 255, 
255, 255 +; SIMD128-FAST-NEXT: local.tee $push7=, $3=, $pop8 +; SIMD128-FAST-NEXT: v128.and $push5=, $pop4, $pop7 +; SIMD128-FAST-NEXT: i8x16.shuffle $push2=, $2, $4, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 +; SIMD128-FAST-NEXT: v128.and $push3=, $pop2, $3 +; SIMD128-FAST-NEXT: i8x16.narrow_i16x8_u $push6=, $pop5, $pop3 +; SIMD128-FAST-NEXT: v128.store 0($0):p2align=0, $pop6 +; SIMD128-FAST-NEXT: return +; +; NO-SIMD128-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-NEXT: # %bb.0: +; NO-SIMD128-NEXT: i32.const $push0=, 255 +; NO-SIMD128-NEXT: i32.and $push2=, $24, $pop0 +; NO-SIMD128-NEXT: i32.const $push143=, 255 +; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop143 +; NO-SIMD128-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-NEXT: i32.const $push4=, 1 +; NO-SIMD128-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-NEXT: i32.const $push142=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop6 +; NO-SIMD128-NEXT: i32.const $push141=, 255 +; NO-SIMD128-NEXT: i32.and $push8=, $8, $pop141 +; NO-SIMD128-NEXT: i32.const $push140=, 255 +; NO-SIMD128-NEXT: i32.and $push7=, $16, $pop140 +; NO-SIMD128-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-NEXT: i32.const $push139=, 1 +; NO-SIMD128-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-NEXT: i32.const $push138=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push137=, 255 +; NO-SIMD128-NEXT: i32.and $push13=, $23, $pop137 +; NO-SIMD128-NEXT: i32.const $push136=, 255 +; NO-SIMD128-NEXT: i32.and $push12=, $31, $pop136 +; NO-SIMD128-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-NEXT: i32.const $push135=, 1 +; NO-SIMD128-NEXT: i32.add 
$push15=, $pop14, $pop135 +; NO-SIMD128-NEXT: i32.const $push134=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop16 +; NO-SIMD128-NEXT: i32.const $push133=, 255 +; NO-SIMD128-NEXT: i32.and $push18=, $7, $pop133 +; NO-SIMD128-NEXT: i32.const $push132=, 255 +; NO-SIMD128-NEXT: i32.and $push17=, $15, $pop132 +; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-NEXT: i32.const $push131=, 1 +; NO-SIMD128-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-NEXT: i32.const $push130=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop21 +; NO-SIMD128-NEXT: i32.const $push129=, 255 +; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop129 +; NO-SIMD128-NEXT: i32.const $push128=, 255 +; NO-SIMD128-NEXT: i32.and $push22=, $30, $pop128 +; NO-SIMD128-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-NEXT: i32.const $push127=, 1 +; NO-SIMD128-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-NEXT: i32.const $push126=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop26 +; NO-SIMD128-NEXT: i32.const $push125=, 255 +; NO-SIMD128-NEXT: i32.and $push28=, $6, $pop125 +; NO-SIMD128-NEXT: i32.const $push124=, 255 +; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop124 +; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-NEXT: i32.const $push123=, 1 +; NO-SIMD128-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-NEXT: i32.const $push122=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop31 +; NO-SIMD128-NEXT: i32.const $push121=, 255 +; NO-SIMD128-NEXT: i32.and $push33=, $21, $pop121 +; NO-SIMD128-NEXT: i32.const $push120=, 255 +; NO-SIMD128-NEXT: i32.and $push32=, $29, $pop120 +; NO-SIMD128-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-NEXT: i32.const $push119=, 1 +; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-NEXT: 
i32.const $push118=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop36 +; NO-SIMD128-NEXT: i32.const $push117=, 255 +; NO-SIMD128-NEXT: i32.and $push38=, $5, $pop117 +; NO-SIMD128-NEXT: i32.const $push116=, 255 +; NO-SIMD128-NEXT: i32.and $push37=, $13, $pop116 +; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-NEXT: i32.const $push115=, 1 +; NO-SIMD128-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-NEXT: i32.const $push114=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop41 +; NO-SIMD128-NEXT: i32.const $push113=, 255 +; NO-SIMD128-NEXT: i32.and $push43=, $20, $pop113 +; NO-SIMD128-NEXT: i32.const $push112=, 255 +; NO-SIMD128-NEXT: i32.and $push42=, $28, $pop112 +; NO-SIMD128-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-NEXT: i32.const $push111=, 1 +; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-NEXT: i32.const $push110=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop46 +; NO-SIMD128-NEXT: i32.const $push109=, 255 +; NO-SIMD128-NEXT: i32.and $push48=, $4, $pop109 +; NO-SIMD128-NEXT: i32.const $push108=, 255 +; NO-SIMD128-NEXT: i32.and $push47=, $12, $pop108 +; NO-SIMD128-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-NEXT: i32.const $push107=, 1 +; NO-SIMD128-NEXT: i32.add $push50=, $pop49, $pop107 +; NO-SIMD128-NEXT: i32.const $push106=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop51 +; NO-SIMD128-NEXT: i32.const $push105=, 255 +; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop105 +; NO-SIMD128-NEXT: i32.const $push104=, 255 +; NO-SIMD128-NEXT: i32.and $push52=, $27, $pop104 +; NO-SIMD128-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-NEXT: i32.const $push103=, 1 +; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-NEXT: i32.const $push102=, 1 +; NO-SIMD128-NEXT: i32.shr_u 
$push56=, $pop55, $pop102 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop56 +; NO-SIMD128-NEXT: i32.const $push101=, 255 +; NO-SIMD128-NEXT: i32.and $push58=, $3, $pop101 +; NO-SIMD128-NEXT: i32.const $push100=, 255 +; NO-SIMD128-NEXT: i32.and $push57=, $11, $pop100 +; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-NEXT: i32.const $push99=, 1 +; NO-SIMD128-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-NEXT: i32.const $push98=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop61 +; NO-SIMD128-NEXT: i32.const $push97=, 255 +; NO-SIMD128-NEXT: i32.and $push63=, $18, $pop97 +; NO-SIMD128-NEXT: i32.const $push96=, 255 +; NO-SIMD128-NEXT: i32.and $push62=, $26, $pop96 +; NO-SIMD128-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-NEXT: i32.const $push95=, 1 +; NO-SIMD128-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-NEXT: i32.const $push94=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop66 +; NO-SIMD128-NEXT: i32.const $push93=, 255 +; NO-SIMD128-NEXT: i32.and $push68=, $2, $pop93 +; NO-SIMD128-NEXT: i32.const $push92=, 255 +; NO-SIMD128-NEXT: i32.and $push67=, $10, $pop92 +; NO-SIMD128-NEXT: i32.add $push69=, $pop68, $pop67 +; NO-SIMD128-NEXT: i32.const $push91=, 1 +; NO-SIMD128-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-NEXT: i32.const $push90=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop71 +; NO-SIMD128-NEXT: i32.const $push89=, 255 +; NO-SIMD128-NEXT: i32.and $push73=, $17, $pop89 +; NO-SIMD128-NEXT: i32.const $push88=, 255 +; NO-SIMD128-NEXT: i32.and $push72=, $25, $pop88 +; NO-SIMD128-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-NEXT: i32.const $push87=, 1 +; NO-SIMD128-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-NEXT: i32.const $push86=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push76=, $pop75, $pop86 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop76 +; 
NO-SIMD128-NEXT: i32.const $push85=, 255 +; NO-SIMD128-NEXT: i32.and $push78=, $1, $pop85 +; NO-SIMD128-NEXT: i32.const $push84=, 255 +; NO-SIMD128-NEXT: i32.and $push77=, $9, $pop84 +; NO-SIMD128-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-NEXT: i32.const $push83=, 1 +; NO-SIMD128-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-NEXT: i32.const $push82=, 1 +; NO-SIMD128-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop81 +; NO-SIMD128-NEXT: return +; +; NO-SIMD128-FAST-LABEL: avgr_undef_shuffle_lanes: +; NO-SIMD128-FAST: .functype avgr_undef_shuffle_lanes (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () +; NO-SIMD128-FAST-NEXT: # %bb.0: +; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push2=, $17, $pop0 +; NO-SIMD128-FAST-NEXT: i32.const $push143=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop143 +; NO-SIMD128-FAST-NEXT: i32.add $push3=, $pop2, $pop1 +; NO-SIMD128-FAST-NEXT: i32.const $push4=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push5=, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push142=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop142 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6 +; NO-SIMD128-FAST-NEXT: i32.const $push141=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push8=, $1, $pop141 +; NO-SIMD128-FAST-NEXT: i32.const $push140=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $9, $pop140 +; NO-SIMD128-FAST-NEXT: i32.add $push9=, $pop8, $pop7 +; NO-SIMD128-FAST-NEXT: i32.const $push139=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push10=, $pop9, $pop139 +; NO-SIMD128-FAST-NEXT: i32.const $push138=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push11=, $pop10, $pop138 +; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push137=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $18, $pop137 +; NO-SIMD128-FAST-NEXT: 
i32.const $push136=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push12=, $26, $pop136 +; NO-SIMD128-FAST-NEXT: i32.add $push14=, $pop13, $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push135=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop135 +; NO-SIMD128-FAST-NEXT: i32.const $push134=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $pop134 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.const $push133=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push18=, $2, $pop133 +; NO-SIMD128-FAST-NEXT: i32.const $push132=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $10, $pop132 +; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop17 +; NO-SIMD128-FAST-NEXT: i32.const $push131=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push20=, $pop19, $pop131 +; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop130 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop21 +; NO-SIMD128-FAST-NEXT: i32.const $push129=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $19, $pop129 +; NO-SIMD128-FAST-NEXT: i32.const $push128=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $27, $pop128 +; NO-SIMD128-FAST-NEXT: i32.add $push24=, $pop23, $pop22 +; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push25=, $pop24, $pop127 +; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $pop126 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.const $push125=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $3, $pop125 +; NO-SIMD128-FAST-NEXT: i32.const $push124=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $11, $pop124 +; NO-SIMD128-FAST-NEXT: i32.add $push29=, $pop28, $pop27 +; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push30=, $pop29, $pop123 +; NO-SIMD128-FAST-NEXT: i32.const $push122=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push31=, $pop30, $pop122 +; NO-SIMD128-FAST-NEXT: i32.store8 
4($0), $pop31 +; NO-SIMD128-FAST-NEXT: i32.const $push121=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $20, $pop121 +; NO-SIMD128-FAST-NEXT: i32.const $push120=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push32=, $28, $pop120 +; NO-SIMD128-FAST-NEXT: i32.add $push34=, $pop33, $pop32 +; NO-SIMD128-FAST-NEXT: i32.const $push119=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop119 +; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop118 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push38=, $4, $pop117 +; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $12, $pop116 +; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop37 +; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push40=, $pop39, $pop115 +; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop114 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $21, $pop113 +; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push42=, $29, $pop112 +; NO-SIMD128-FAST-NEXT: i32.add $push44=, $pop43, $pop42 +; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push45=, $pop44, $pop111 +; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push46=, $pop45, $pop110 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop46 +; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push48=, $5, $pop109 +; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $13, $pop108 +; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop47 +; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push50=, $pop49, $pop107 +; 
NO-SIMD128-FAST-NEXT: i32.const $push106=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop51 +; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $22, $pop105 +; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push52=, $30, $pop104 +; NO-SIMD128-FAST-NEXT: i32.add $push54=, $pop53, $pop52 +; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop103 +; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push56=, $pop55, $pop102 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push58=, $6, $pop101 +; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop100 +; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop57 +; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push60=, $pop59, $pop99 +; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop98 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop61 +; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push63=, $23, $pop97 +; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push62=, $31, $pop96 +; NO-SIMD128-FAST-NEXT: i32.add $push64=, $pop63, $pop62 +; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push65=, $pop64, $pop95 +; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push66=, $pop65, $pop94 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop66 +; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push68=, $7, $pop93 +; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push67=, $15, $pop92 +; NO-SIMD128-FAST-NEXT: i32.add 
$push69=, $pop68, $pop67 +; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push70=, $pop69, $pop91 +; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push71=, $pop70, $pop90 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop71 +; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push73=, $24, $pop89 +; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push72=, $32, $pop88 +; NO-SIMD128-FAST-NEXT: i32.add $push74=, $pop73, $pop72 +; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push75=, $pop74, $pop87 +; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push76=, $pop75, $pop86 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop76 +; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push78=, $8, $pop85 +; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255 +; NO-SIMD128-FAST-NEXT: i32.and $push77=, $16, $pop84 +; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop77 +; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1 +; NO-SIMD128-FAST-NEXT: i32.add $push80=, $pop79, $pop83 +; NO-SIMD128-FAST-NEXT: i32.const $push82=, 1 +; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop82 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop81 +; NO-SIMD128-FAST-NEXT: return + %zext.0 = zext <8 x i8> %a to <8 x i16> + %zext.1 = zext <8 x i8> %b to <8 x i16> + %add.0 = add nuw nsw <8 x i16> %zext.0, splat (i16 1) + %add.1 = add nuw nsw <8 x i16> %add.0, %zext.1 + %shift.0 = lshr <8 x i16> %add.1, splat (i16 1) + %zext.2 = zext <8 x i8> %c to <8 x i16> + %zext.3 = zext <8 x i8> %d to <8 x i16> + %add.2 = add nuw nsw <8 x i16> %zext.2, splat (i16 1) + %add.3 = add nuw nsw <8 x i16> %add.2, %zext.3 + %shift.1 = lshr <8 x i16> %add.3, splat (i16 1) + %shuffle = shufflevector <8 x i16> %shift.0, <8 x i16> %shift.1, <16 x i32> + %trunc = trunc nuw <16 x i16> %shuffle to <16 x i8> + store <16 
x i8> %trunc, ptr %res, align 1 + ret void +} define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ; SIMD128-LABEL: avgr_u_v16i8_wrap: ; SIMD128: .functype avgr_u_v16i8_wrap (v128, v128) -> (v128) diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll index 77053e2c1bc98..4dd883a24f623 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) ; CHECK-LABEL: gather_qps: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: 
vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, < define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: ## %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index df71e3c3afa5e..5ed91ea1eb872 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) { ; CHECK-LABEL: gather_qps: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: kxnorw %k0, %k0, %k2 +; CHECK-NEXT: kxnorb %k0, %k0, %k2 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2} ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1} @@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1} ; 
CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 @@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %esi, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) { ; CHECK-LABEL: scatter_mask_test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1} -; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: kxorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1} ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 @@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) { ; CHECK-LABEL: gather_global: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1} ; CHECK-NEXT: vmovaps %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll new file mode 100644 index 0000000000000..ca5f3192d7b97 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -0,0 +1,229 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) +declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>) +declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) + +; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ) +define <8 x float> @mask_v8i1_allones(ptr %ptr) { +; AVX512F-LABEL: mask_v8i1_allones: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v8i1_allones: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v8i1_allones: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v8i1_allones: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: # kill: def $ymm0 killed 
$ymm0 killed $zmm0 +; AVX512DQBW-NEXT: retq + %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> , <8 x float> zeroinitializer) + ret <8 x float> %res +} + +; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ) +define <16 x float> @mask_v16i1_lower8(ptr %ptr) { +; AVX512F-LABEL: mask_v16i1_lower8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v16i1_lower8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v16i1_lower8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v16i1_lower8: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512DQBW-NEXT: retq + %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> , <16 x float> zeroinitializer) + ret <16 x float> %res +} + +; Test case 3: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512-LABEL: gather_all: +; AVX512: # %bb.0: +; AVX512-NEXT: kxnorw %k0, %k0, %k1 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> poison) + ret <16 x float> %res +} + +; Test case 4: v8i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets) +define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_lower: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_lower: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_lower: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_lower: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> , <16 x float> poison) + ret <16 x float> %res +} + +; Test case 5: v32i1 mask via bitconvert combined with dynamic condition. 
+; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle. +define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { +; AVX512F-LABEL: mask_v32i1_lower16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v32i1_lower16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v32i1_lower16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: kord %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v32i1_lower16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: kord %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i32 65535 to <32 x i1> + %mask1 = icmp sgt <32 x i16> %c, %d + %mask = or <32 x i1> %mask0, %mask1 + %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +; Test case 6: v64i1 mask via bitconvert combined with dynamic condition. +; Verifies the KSET1D submask pattern survives past SelectionDAG combines. 
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { +; AVX512F-LABEL: mask_v64i1_lower32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: mask_v64i1_lower32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: mask_v64i1_lower32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF +; AVX512BW-NEXT: kmovq %rax, %k0 +; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: korq %k0, %k1, %k1 +; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: mask_v64i1_lower32: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0 +; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1 +; AVX512DQBW-NEXT: korq %k0, %k1, %k1 +; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1} +; AVX512DQBW-NEXT: retq + %mask0 = bitcast i64 4294967295 to <64 x i1> + %mask1 = icmp sgt <64 x i8> %c, %d + %mask = or <64 x i1> %mask0, %mask1 + %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b + ret <64 x i8> %res +} + diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index e9e9ee9c97593..9b7569ff8b29f 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -967,82 +967,63 @@ define <8 x i16> 
@complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 16(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movzwl 14(%edx), %edi +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 12(%eax), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzwl 14(%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll $16, %edi -; X86-NEXT: movzwl 12(%edx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %ecx, %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl 52(%esp,%eax), %edx -; X86-NEXT: movzbl 16(%ebp), %ecx -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movzwl 10(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $16, %ebx -; X86-NEXT: movzwl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzwl 6(%ecx), %eax -; X86-NEXT: movl %eax, (%esp) # 
4-byte Spill -; X86-NEXT: shll $16, %eax -; X86-NEXT: movzwl 4(%ecx), %ecx +; X86-NEXT: movzwl 2(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: movzbl 16(%ebp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzwl 2(%ecx), %edx +; X86-NEXT: movzwl 4(%eax), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $16, %edx -; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl 6(%eax), %esi +; X86-NEXT: movzwl 8(%eax), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: xorl %esi, %edx +; X86-NEXT: movzwl 10(%eax), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shll $16, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: andb $96, %bl +; X86-NEXT: shrb $3, %bl +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: movl 32(%esp,%edi), %edi +; X86-NEXT: btcl %eax, %edi +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl %ebx, 8(%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: movl %edi, (%ecx,%eax) ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: 
movw %dx, 14(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movw %dx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movw %dx, 10(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movw %dx, 8(%eax) -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: movw %dx, 6(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movw %dx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 10(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movw %si, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 4(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1056,81 +1037,57 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind ; ; SSE2-LABEL: complement_ne_i128_bitcast: ; SSE2: # %bb.0: -; SSE2-NEXT: movl %esi, %ecx -; SSE2-NEXT: movl $1, %eax -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: shldq %cl, %rax, %rdx -; SSE2-NEXT: xorl %esi, %esi -; SSE2-NEXT: shlq %cl, %rax -; SSE2-NEXT: testb $64, %cl -; SSE2-NEXT: cmovneq %rax, %rdx -; SSE2-NEXT: cmovneq %rsi, %rax +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: xorq %rdx, 8(%rdi) -; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: movq 8(%rdi), %rax +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movl %esi, %ecx +; SSE2-NEXT: andb $32, %cl +; SSE2-NEXT: shrdq %cl, %rax, %rdx +; SSE2-NEXT: shrq %cl, %rax +; SSE2-NEXT: testb $64, %sil +; SSE2-NEXT: cmoveq %rdx, %rax +; SSE2-NEXT: btcl %esi, %eax +; SSE2-NEXT: andl $96, %esi +; SSE2-NEXT: shrl $3, %esi +; SSE2-NEXT: movl %eax, (%rdi,%rsi) ; SSE2-NEXT: 
retq ; ; SSE4-LABEL: complement_ne_i128_bitcast: ; SSE4: # %bb.0: -; SSE4-NEXT: movl %esi, %ecx -; SSE4-NEXT: movl $1, %eax -; SSE4-NEXT: xorl %edx, %edx -; SSE4-NEXT: shldq %cl, %rax, %rdx -; SSE4-NEXT: shlq %cl, %rax -; SSE4-NEXT: xorl %esi, %esi -; SSE4-NEXT: testb $64, %cl -; SSE4-NEXT: cmovneq %rax, %rdx -; SSE4-NEXT: cmovneq %rsi, %rax +; SSE4-NEXT: # kill: def $esi killed $esi def $rsi ; SSE4-NEXT: movdqa (%rdi), %xmm0 -; SSE4-NEXT: movq %xmm0, %rcx -; SSE4-NEXT: xorq %rax, %rcx ; SSE4-NEXT: pextrq $1, %xmm0, %rax -; SSE4-NEXT: xorq %rdx, %rax -; SSE4-NEXT: movq %rax, 8(%rdi) -; SSE4-NEXT: movq %rcx, (%rdi) +; SSE4-NEXT: movq %xmm0, %rdx +; SSE4-NEXT: movl %esi, %ecx +; SSE4-NEXT: andb $32, %cl +; SSE4-NEXT: shrdq %cl, %rax, %rdx +; SSE4-NEXT: shrq %cl, %rax +; SSE4-NEXT: testb $64, %sil +; SSE4-NEXT: cmoveq %rdx, %rax +; SSE4-NEXT: btcl %esi, %eax +; SSE4-NEXT: andl $96, %esi +; SSE4-NEXT: shrl $3, %esi +; SSE4-NEXT: movl %eax, (%rdi,%rsi) ; SSE4-NEXT: retq ; -; AVX2-LABEL: complement_ne_i128_bitcast: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rcx -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: xorq %rdx, %rsi -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: movq %rcx, (%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i128_bitcast: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: xorq %rdx, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %rcx -; AVX512-NEXT: xorq %rsi, %rcx -; AVX512-NEXT: movq %rcx, 8(%rdi) -; AVX512-NEXT: movq %rax, (%rdi) -; AVX512-NEXT: retq +; AVX-LABEL: complement_ne_i128_bitcast: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: vmovq %xmm0, %rdx +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andb $32, %cl +; AVX-NEXT: shrdq %cl, %rax, %rdx +; AVX-NEXT: shrxq %rcx, %rax, %rax +; AVX-NEXT: testb $64, %sil +; AVX-NEXT: cmoveq %rdx, %rax +; AVX-NEXT: btcl %esi, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: movl %eax, (%rdi,%rsi) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 2f691e7ca8f5b..58adbb767ed87 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , < ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { -; X64-LABEL: test6: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} -; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; X64-NEXT: vmovdqa %ymm2, %ymm0 -; X64-NEXT: retq +; X64-KNL-LABEL: test6: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0 +; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test6: ; X86-KNL: # %bb.0: @@ -230,11 
+230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { ; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; +; X64-SKX-LABEL: test6: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0 +; X64-SKX-NEXT: retq +; ; X86-SKX-LABEL: test6: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k2 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k2 ; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} ; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} ; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0 @@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { ; X86-SKX-NEXT: 
vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 ; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-SMALL-NEXT: retq @@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} ; X64-SKX-LARGE-NEXT: retq @@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { ; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 ; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> declare <16 x ptr> 
@llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) define <16 x ptr> @test31(<16 x ptr> %ptrs) { -; X64-LABEL: test31: -; X64: # %bb.0: -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} -; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 -; X64-NEXT: vmovdqa64 %zmm2, %zmm1 -; X64-NEXT: retq +; X64-KNL-LABEL: test31: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-KNL-NEXT: retq ; ; X86-LABEL: test31: ; X86: # %bb.0: @@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) { ; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: test31: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} +; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1 +; X64-SKX-NEXT: retq %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> , <16 x ptr> undef) ret <16 x ptr>%res } @@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, 
%ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: test_global_array: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> , <8 x i32> undef) ret <8 x i32> %g @@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0 ; X64-KNL-NEXT: retq ; -; X86-LABEL: test_global_array_zeroinitializer_index: -; X86: # %bb.0: -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} -; X86-NEXT: vmovdqa %ymm1, %ymm0 -; X86-NEXT: retl +; X86-KNL-LABEL: 
test_global_array_zeroinitializer_index: +; X86-KNL: # %bb.0: +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0 +; X86-KNL-NEXT: retl ; ; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0 @@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { ; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index: ; X64-SKX-LARGE: # %bb.0: ; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax -; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0 ; X64-SKX-LARGE-NEXT: retq +; +; X86-SKX-LABEL: test_global_array_zeroinitializer_index: +; X86-SKX: # %bb.0: +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} +; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0 +; X86-SKX-NEXT: retl %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> , <8 x i32> undef) ret <8 x i32> %g @@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: sext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3554,7 
+3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1 -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X64-SKX-LABEL: zext_v8i8_index: ; X64-SKX: # %bb.0: ; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; X64-SKX-NEXT: retq @@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { ; X86-SKX: # %bb.0: ; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-SKX-NEXT: kxnorw %k0, %k0, %k1 +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 ; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; X86-SKX-NEXT: retl @@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) { } define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { -; X64-LABEL: pr163023_zext: -; X64: # %bb.0: -; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; X64-NEXT: kxnorw %k0, %k0, %k2 -; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} -; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} -; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr163023_zext: +; X64-KNL: # %bb.0: +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-KNL-NEXT: kxnorw %k0, %k0, %k2 +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2} +; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-KNL-NEXT: retq ; ; X86-LABEL: pr163023_zext: ; X86: # %bb.0: @@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { ; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl +; +; X64-SKX-LABEL: pr163023_zext: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-SKX-NEXT: kxnorb %k0, %k0, %k2 +; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 
{%k2} +; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1} +; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; X64-SKX-NEXT: retq %addr.p = ptrtoint ptr %a0 to i64 %addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0 %addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer @@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) { %struct.foo = type { ptr, i64, i16, i16, i32 } define <8 x i64> @pr45906(<8 x ptr> %ptr) { -; X64-LABEL: pr45906: -; X64: # %bb.0: # %bb -; X64-NEXT: kxnorw %k0, %k0, %k1 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} -; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -; X64-NEXT: retq +; X64-KNL-LABEL: pr45906: +; X64-KNL: # %bb.0: # %bb +; X64-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-KNL-NEXT: retq ; -; X86-LABEL: pr45906: -; X86: # %bb.0: # %bb -; X86-NEXT: kxnorw %k0, %k0, %k1 -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 -; X86-NEXT: retl +; X86-KNL-LABEL: pr45906: +; X86-KNL: # %bb.0: # %bb +; X86-KNL-NEXT: kxnorw %k0, %k0, %k1 +; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-KNL-NEXT: retl +; +; X64-SKX-LABEL: pr45906: +; X64-SKX: # %bb.0: # %bb +; X64-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X64-SKX-NEXT: retq +; +; X86-SKX-LABEL: pr45906: +; X86-SKX: # %bb.0: # %bb +; X86-SKX-NEXT: kxnorb %k0, %k0, %k1 +; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1} +; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; X86-SKX-NEXT: retl bb: %tmp = getelementptr inbounds %struct.foo, <8 x ptr> 
%ptr, i64 0, i32 1 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> , <8 x i64> undef) diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll index 762a050247a87..36bf31395d6d5 100644 --- a/llvm/test/CodeGen/X86/scatter-schedule.ll +++ b/llvm/test/CodeGen/X86/scatter-schedule.ll @@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu" define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} -; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: kxnorb %k0, %k0, %k1 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a768baae97add..466fa6ba098b3 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: movw $255, %ax -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} ; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpmovd2m 
%zmm1, %k1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2 +; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: movw $255, %ax -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} ; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2 +; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper diff --git 
a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s new file mode 100644 index 0000000000000..fca8345ff207c --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_ehframe.s @@ -0,0 +1,58 @@ +# REQUIRES: asserts +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux-gnu -filetype=obj -o %t %s +# RUN: llvm-jitlink -noexec -phony-externals -debug-only=jitlink %t 2>&1 | \ +# RUN: FileCheck %s +# +# Check that splitting of eh-frame sections works. +# +# CHECK: DWARFRecordSectionSplitter: Processing .eh_frame... +# CHECK: Processing block at +# CHECK: Processing CFI record at +# CHECK: Processing CFI record at +# CHECK: EHFrameEdgeFixer: Processing .eh_frame in "{{.*}}"... +# CHECK: Processing block at +# CHECK: Record is CIE +# CHECK: Processing block at +# CHECK: Record is FDE +# CHECK: Adding edge at {{.*}} to CIE at: {{.*}} +# CHECK: Processing PC-begin at +# CHECK: Existing edge at {{.*}} to PC begin at {{.*}} +# CHECK: Adding keep-alive edge from target at {{.*}} to FDE at {{.*}} + + .text + .file "exceptions.cpp" + # Start of file scope inline assembly + .globl _ZSt21ios_base_library_initv + + # End of file scope inline assembly + .globl main # -- Begin function main + .p2align 4 + .type main,@function +main: # @main + .cfi_startproc +# %bb.0: # %entry + stmg %r11, %r15, 88(%r15) + .cfi_offset %r11, -72 + .cfi_offset %r14, -48 + .cfi_offset %r15, -40 + aghi %r15, -168 + .cfi_def_cfa_offset 328 + lgr %r11, %r15 + .cfi_def_cfa_register %r11 + mvhi 164(%r11), 0 + lghi %r2, 4 + brasl %r14, __cxa_allocate_exception@PLT + mvhi 0(%r2), 1 + lgrl %r3, _ZTIi@GOT + lghi %r4, 0 + brasl %r14, __cxa_throw@PLT +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym __cxa_allocate_exception + .addrsig_sym __cxa_throw + .addrsig_sym _ZTIi diff --git 
a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s new file mode 100644 index 0000000000000..04e828685c040 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs16.s @@ -0,0 +1,35 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs X=0xFFFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs X=0x10000 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_16 handling. + +# jitlink-check: *{8}P = X + +# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer16 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + + .type P,@object + .data + .globl P + .p2align 1 +P: + .short 0 + .short 0 + .short 0 + .short X # Using short here generates R_390_16. + .size P, 8 + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s new file mode 100644 index 0000000000000..1a63acdb63d57 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs32.s @@ -0,0 +1,32 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs X=0x12345678 -check=%s %t.o +# +# RUN: not llvm-jitlink -noexec -abs X=0x123456789 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_32 handling. 
+ +# jitlink-check: *{8}P = X + +# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer32 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + + .type P,@object + .data + .globl P + .p2align 2 +P: + .long 0 + .long X # Using long here generates R_390_32. + .size P, 8 diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s new file mode 100644 index 0000000000000..63d2a1a539aeb --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs64.s @@ -0,0 +1,28 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs X=0xffffffffffffffff -check=%s %t.o +# +# Check the success case of R_390_64 handling (no out-of-range failure is possible for a 64-bit fixup). + +# jitlink-check: *{8}P = X + +# CHECK-ERROR: relocation target "X" {{.*}} is out of range of Pointer64 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + + .type P,@object + .data + .globl P + .p2align 4 +P: + .quad X # Using quad here generates R_390_64. + .size P, 8 diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s new file mode 100644 index 0000000000000..5f23f289140a6 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_abs8.s @@ -0,0 +1,38 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs X=0xFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs X=0x100 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_8 handling. 
+ +# jitlink-check: *{8}P = X + +# CHECK-ERROR: relocation target {{.*}} (X) is out of range of Pointer8 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + + .type P,@object + .data + .globl P +P: + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .byte X # Using byte here generates R_390_8. + .size P, 8 + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s new file mode 100644 index 0000000000000..e38a2f39b7c22 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_call_pic.s @@ -0,0 +1,96 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_pic_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xfff00000 -slab-page-size 4096 \ +# RUN: -abs external_data=0x1 \ +# RUN: -abs extern_out_of_range32=0x7fff00000000 \ +# RUN: -abs extern_in_range32=0xffe00000 \ +# RUN: -check %s %t/elf_pic_reloc.o +# XFAIL: * + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl named_func + .p2align 4 + .type named_func,@function +named_func: + br %r14 + .size named_func, .-named_func + +# Check R_390_PC32DBL handling with a call to a local function in the text +# section. This produces a Delta32dbl edge that is resolved like a regular +# direct relative branches(no PLT entry created). 
+# +# jitlink-check: decode_operand(test_call_local, 1) = \ +# jitlink-check: named_func - test_call_local + .globl test_call_local + .p2align 4 + .type test_call_local,@function +test_call_local: + brasl %r14, named_func@PLT + + .size test_call_local, .-test_call_local + +# Check R_390_PLT32dbl(DeltaPLT32dbl) handling with a call to an +# external via PLT. This produces a Delta32dbl edge, because externals are +# not defined locally. As the target is out-of-range from the callsite, +# the edge keeps using its PLT entry. +# +# jitlink-check: decode_operand(test_call_extern_plt, 1) = \ +# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32) - \ +# jitlink-check: test_call_extern_plt +# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_out_of_range32)) = \ +# jitlink-check: extern_out_of_range32 + .globl test_call_extern_plt + .p2align 4 + .type test_call_extern_plt,@function +test_call_extern_plt: + brasl %r14, extern_out_of_range32@plt + + .size test_call_extern_plt, .-test_call_extern_plt + +# Check PLT stub relocation for lgrl(Delta32dbl). +# +# jitlink-check: *{4}(stub_addr(elf_pic_reloc.o, extern_out_of_range32) + 2) = \ +# jitlink-check: (got_addr(elf_pic_reloc.o, extern_out_of_range32) - \ +# jitlink-check: stub_addr(elf_pic_reloc.o, extern_out_of_range32)) >> 1 + .globl test_call_extern_plt_stub + .p2align 4 + .type test_call_extern_plt_stub,@function +test_call_extern_plt_stub: + brasl %r14, extern_out_of_range32@plt + + .size test_call_extern_plt_stub, .-test_call_extern_plt_stub + +# Check R_390_PLT32(DeltaPLT32dbl) handling with a call to an external. +# This produces a Delta32dbl edge, because externals are not defined +# locally. During resolution, the target turns out to be in-range from the +# callsite. 
+### TODO: edge can be relaxed in post-allocation optimization, it will then +### require: +### jitlink-check: decode_operand(test_call_extern, 1) = \ +### jitlink-check: extern_in_range32 - test_call_extern +# +# Same as test_call_extern_plt(no-optimization) +# jitlink-check: decode_operand(test_call_extern, 1) = \ +# jitlink-check: stub_addr(elf_pic_reloc.o, extern_in_range32) - \ +# jitlink-check: test_call_extern +# jitlink-check: *{8}(got_addr(elf_pic_reloc.o, extern_in_range32)) = \ +# jitlink-check: extern_in_range32 + .globl test_call_extern + .p2align 4 + .type test_call_extern,@function +test_call_extern: + brasl %r14, extern_in_range32@plt + .size test_call_extern, .-test_call_extern + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s new file mode 100644 index 0000000000000..cf12cdc987ce3 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp12.s @@ -0,0 +1,28 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs DISP=0xFFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs DISP=0x1000 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_12 handling. 
+ +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer12 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_12, DISP + l %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s new file mode 100644 index 0000000000000..5c7de535cf8b4 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_disp20.s @@ -0,0 +1,31 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs DISP=0x7FFFF -check=%s %t.o + +# RUN: not llvm-jitlink -noexec -abs DISP=0x80000 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s + +# RUN: not llvm-jitlink -noexec -abs DISP=0xFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# Check success and failure cases of R_390_20 handling. 
+ +# CHECK-ERROR: relocation target {{.*}} (DISP) is out of range of +# CHECK-ERROR: Pointer20 fixup + +# jitlink-check: decode_operand(main, 2) = DISP + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + .reloc .+2, R_390_20, DISP + lg %r6, 0(%r7,%r8) + br %r14 +.Lfunc_end0: + .size main, .Lfunc_end0-main + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s new file mode 100644 index 0000000000000..7da48cfa704e2 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_got.s @@ -0,0 +1,244 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: %t/elf_reloc.o -check %s + +# Verifying GOT related relocations. 
+ + .text + .globl main + .type main,@function +main: +# jitlink-check: decode_operand(main, 1) = _GLOBAL_OFFSET_TABLE_ - main + larl %r12, _GLOBAL_OFFSET_TABLE_ + .globl test_gotent_foo +test_gotent_foo: +# jitlink-check: decode_operand(test_gotent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotent_foo) + .reloc .+2, R_390_GOTENT, foo+2 + larl %r1, 0 + .size test_gotent_foo, .-test_gotent_foo + + .globl test_gotent_bar +test_gotent_bar: +# jitlink-check: decode_operand(test_gotent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotent_bar) + .reloc .+2, R_390_GOTENT, bar+2 + larl %r1, 0 + .size test_gotent_bar, .-test_gotent_bar + + .globl test_gotpltent_foo +test_gotpltent_foo: +# jitlink-check: decode_operand(test_gotpltent_foo, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - test_gotpltent_foo) + .reloc .+2, R_390_GOTPLTENT, foo+2 + larl %r1, 0 + .size test_gotpltent_foo, .-test_gotpltent_foo + + .globl test_gotpltent_bar +test_gotpltent_bar: +# jitlink-check: decode_operand(test_gotpltent_bar, 1) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - test_gotpltent_bar) + .reloc .+2, R_390_GOTPLTENT, bar+2 + larl %r1, 0 + .size test_gotpltent_bar, .-test_gotpltent_bar + + .globl test_got12_foo +test_got12_foo: +# jitlink-check: decode_operand(test_got12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, foo + l %r1, 0(%r12) + .size test_got12_foo, .-test_got12_foo + + .globl test_got12_bar +test_got12_bar: +# jitlink-check: decode_operand(test_got12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT12, bar + l %r1, 0(%r12) + .size test_got12_bar, .-test_got12_bar + + .globl test_gotplt12_foo +test_gotplt12_foo: +# jitlink-check: decode_operand(test_gotplt12_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, foo + l %r1, 0(%r12) + .size 
test_gotplt12_foo, .-test_gotplt12_foo + + .globl test_gotplt12_bar +test_gotplt12_bar: +# jitlink-check: decode_operand(test_gotplt12_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT12, bar + l %r1, 0(%r12) + .size test_gotplt12_bar, .-test_gotplt12_bar + + .globl test_got20_foo +test_got20_foo: +# jitlink-check: decode_operand(test_got20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, foo + lg %r1, 0(%r12) + .size test_got20_foo, .-test_got20_foo + + .globl test_got20_bar +test_got20_bar: +# jitlink-check: decode_operand(test_got20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOT20, bar + lg %r1, 0(%r12) + .size test_got20_bar, .-test_got20_bar + + .globl test_gotplt20_foo +test_gotplt20_foo: +# jitlink-check: decode_operand(test_gotplt20_foo, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, foo + lg %r1, 0(%r12) + .size test_gotplt20_foo, .-test_gotplt20_foo + + .globl test_gotplt20_bar +test_gotplt20_bar: +# jitlink-check: decode_operand(test_gotplt20_bar, 2) = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) + .reloc .+2, R_390_GOTPLT20, bar + lg %r1, 0(%r12) + .size test_gotplt20_bar, .-test_gotplt20_bar + br %r14 + .size main, .-main + + .data + .globl test_got16_foo +# jitlink-check: *{2}test_got16_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got16_foo: + .reloc ., R_390_GOT16, foo + .space 2 + .size test_got16_foo, .-test_got16_foo + + .globl test_got16_bar +# jitlink-check: *{2}test_got16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got16_bar: + .reloc ., R_390_GOT16, bar + .space 2 + .size test_got16_bar, .-test_got16_bar + + .globl test_gotplt16_foo +# jitlink-check: *{2}test_gotplt16_foo = \ +# jitlink-check: 
(got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_foo: + .reloc ., R_390_GOTPLT16, foo + .space 2 + .size test_gotplt16_foo, .-test_gotplt16_foo + + .globl test_gotplt16_bar +# jitlink-check: *{2}test_gotplt16_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt16_bar: + .reloc ., R_390_GOTPLT16, bar + .space 2 + .size test_gotplt16_bar, .-test_gotplt16_bar + + .globl test_got32_foo +# jitlink-check: *{4}test_got32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got32_foo: + .reloc ., R_390_GOT32, foo + .space 4 + .size test_got32_foo, .-test_got32_foo + + .globl test_got32_bar +# jitlink-check: *{4}test_got32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got32_bar: + .reloc ., R_390_GOT32, bar + .space 4 + .size test_got32_bar, .-test_got32_bar + + .globl test_gotplt32_foo +# jitlink-check: *{4}test_gotplt32_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_foo: + .reloc ., R_390_GOTPLT32, foo + .space 4 + .size test_gotplt32_foo, .-test_gotplt32_foo + + .globl test_gotplt32_bar +# jitlink-check: *{4}test_gotplt32_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt32_bar: + .reloc ., R_390_GOTPLT32, bar + .space 4 + .size test_gotplt32_bar, .-test_gotplt32_bar + + .globl test_got64_foo +# jitlink-check: *{8}test_got64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_got64_foo: + .reloc ., R_390_GOT64, foo + .space 8 + .size test_got64_foo, .-test_got64_foo + + .globl test_got64_bar +# jitlink-check: *{8}test_got64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_got64_bar: + .reloc ., R_390_GOT64, bar + .space 8 + .size test_got64_bar, .-test_got64_bar + + .globl test_gotplt64_foo +# jitlink-check: *{8}test_gotplt64_foo = \ +# jitlink-check: (got_addr(elf_reloc.o, foo) - 
_GLOBAL_OFFSET_TABLE_) +test_gotplt64_foo: + .reloc ., R_390_GOTPLT64, foo + .space 8 + .size test_gotplt64_foo, .-test_gotplt64_foo + + .globl test_gotplt64_bar +# jitlink-check: *{8}test_gotplt64_bar = \ +# jitlink-check: (got_addr(elf_reloc.o, bar) - _GLOBAL_OFFSET_TABLE_) +test_gotplt64_bar: + .reloc ., R_390_GOTPLT64, bar + .space 8 + .size test_gotplt64_bar, .-test_gotplt64_bar + + .globl test_gotpc_foo +# jitlink-check: *{4}test_gotpc_foo = _GLOBAL_OFFSET_TABLE_ - test_gotpc_foo +test_gotpc_foo: + .reloc ., R_390_GOTPC, foo + .space 4 + .size test_gotpc_foo, .-test_gotpc_foo + + .globl test_gotpc_bar +# jitlink-check: *{4}test_gotpc_bar = _GLOBAL_OFFSET_TABLE_ - test_gotpc_bar +test_gotpc_bar: + .reloc ., R_390_GOTPC, bar + .space 4 + .size test_gotpc_bar, .-test_gotpc_bar + + .globl test_gotpcdbl_foo +# jitlink-check: *{4}test_gotpcdbl_foo = \ +# jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_foo) >> 1 +test_gotpcdbl_foo: + .reloc ., R_390_GOTPCDBL, foo + .space 4 + .size test_gotpcdbl_foo, .-test_gotpcdbl_foo + + .globl test_gotpcdbl_bar +# jitlink-check: *{4}test_gotpcdbl_bar = \ +# jitlink-check: (_GLOBAL_OFFSET_TABLE_ - test_gotpcdbl_bar) >> 1 +test_gotpcdbl_bar: + .reloc ., R_390_GOTPCDBL, bar + .space 4 + .size test_gotpcdbl_bar, .-test_gotpcdbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s new file mode 100644 index 0000000000000..af37b3f75ca42 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_gotrel.s @@ -0,0 +1,68 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0x6ff00000 -slab-page-size 4096 \ +# RUN: -abs foo=0x6ff04080 \ +# RUN: -abs bar=0x6ff04040 \ +# RUN: %t/elf_reloc.o -check %s + + .text + 
.globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .data + .globl test_gotoff16_bar +# jitlink-check: *{2}test_gotoff16_bar = (bar - _GLOBAL_OFFSET_TABLE_) & 0xffff +test_gotoff16_bar: + .reloc ., R_390_GOTOFF16, bar + .space 2 + .size test_gotoff16_bar, .-test_gotoff16_bar + + .globl test_pltoff16_foo +# jitlink-check: *{2}test_pltoff16_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffff +test_pltoff16_foo: + .reloc ., R_390_PLTOFF16, foo + .space 2 + .size test_pltoff16_foo, .-test_pltoff16_foo + + + .globl test_gotoff32_bar +# jitlink-check: *{4}test_gotoff32_bar = (bar - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_gotoff32_bar: + .reloc ., R_390_GOTOFF, bar + .space 4 + .size test_gotoff32_bar, .-test_gotoff32_bar + + .globl test_pltoff32_foo +# jitlink-check: *{4}test_pltoff32_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) \ +# jitlink-check: & 0xffffffff +test_pltoff32_foo: + .reloc ., R_390_PLTOFF32, foo + .space 4 + .size test_pltoff32_foo, .-test_pltoff32_foo + + .globl test_gotoff64_bar +# jitlink-check: *{8}test_gotoff64_bar = bar - _GLOBAL_OFFSET_TABLE_ +test_gotoff64_bar: + .reloc ., R_390_GOTOFF64, bar + .space 8 + .size test_gotoff64_bar, .-test_gotoff64_bar + + .globl test_pltoff64_foo +# jitlink-check: *{8}test_pltoff64_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - _GLOBAL_OFFSET_TABLE_) +test_pltoff64_foo: + .reloc ., R_390_PLTOFF64, foo + .space 8 + .size test_pltoff64_foo, .-test_pltoff64_foo + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s new file mode 100644 index 0000000000000..4b3a65e53ab93 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc.s @@ -0,0 +1,20 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: 
-filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec %t.o +# +# Check R_390_PC* handling. + + .text + .globl main + .type main,@function +main: + br %r14 + .size main, .-main + + .rodata + .short main-. # Generate R_390_PC16 relocation. + .long main-. # Generate R_390_PC32 relocation. + .quad main-. # Generate R_390_PC64 relocation. + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s new file mode 100644 index 0000000000000..0da54b2a58972 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc16.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x8000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x8001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x8001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{2}test_pc16 = OFFSET +# jitlink-check: *{2}test_pc16dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta16 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc16 +test_pc16: + .reloc test_pc16, R_390_PC16, .-OFFSET + .space 2 + .size test_pc16, .-test_pc16 + + .globl test_pc16dbl +test_pc16dbl: + .reloc test_pc16dbl, R_390_PC16DBL, .-(OFFSET + OFFSET) + .space 2 + .size test_pc16dbl, .-test_pc16dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s 
b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s new file mode 100644 index 0000000000000..503fd2d0a5d49 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc32.s @@ -0,0 +1,41 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000000 -filetype=obj -o %t.o %s +# RUN: llvm-jitlink -noexec -abs OFFSET=0x80000000 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0xFFFFFFFF -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0xFFFFFFFF %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -defsym OFFSET=0x80000001 -filetype=obj -o %t.o %s +# RUN: not llvm-jitlink -noexec -abs OFFSET=0x80000001 %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=CHECK-ERROR %s +# +# jitlink-check: *{4}test_pc32 = OFFSET +# jitlink-check: *{4}test_pc32dbl = OFFSET + +# CHECK-ERROR: {{.*}} is out of range of Delta32 fixup + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc32 +test_pc32: + .reloc test_pc32, R_390_PC32, .-OFFSET + .space 4 + .size test_pc32, .-test_pc32 + + .globl test_pc32dbl +test_pc32dbl: + .reloc test_pc32dbl, R_390_PC32DBL, .-(OFFSET + OFFSET) + .space 4 + .size test_pc32dbl, .-test_pc32dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s new file mode 100644 index 0000000000000..0d33ae2976de5 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pc64.s @@ -0,0 +1,34 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# 
RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \ +# RUN: -abs external_data=0x1 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: -check %s %t/elf_reloc.o + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_pc64_foo +# jitlink-check: *{8}test_pc64_foo = foo - test_pc64_foo +test_pc64_foo: + .reloc ., R_390_PC64, foo + .space 8 + .size test_pc64_foo, .-test_pc64_foo + + .globl test_pc64_bar +# jitlink-check: *{8}test_pc64_bar = bar - test_pc64_bar +test_pc64_bar: + .reloc ., R_390_PC64, bar + .space 8 + .size test_pc64_bar, .-test_pc64_bar diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s new file mode 100644 index 0000000000000..efe8357e76bef --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pcdbl.s @@ -0,0 +1,84 @@ +# REQUIRES: system-linux +# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \ +# RUN: -defsym OFF12=0xffe -defsym OFF16=4 -defsym OFF24=6 \ +# RUN: -defsym OFF32=6 -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs OFF12=0xffe -abs OFF16=4 -abs OFF24=6 \ +# RUN: -abs OFF32=6 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \ +# RUN: -defsym OFF12=6 -defsym OFF16=0xfffe -defsym OFF24=6 \ +# RUN: -defsym OFF32=6 -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=0xfffe -abs OFF24=6 \ +# RUN: -abs OFF32=6 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 -position-independent \ +# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=0xfffffe \ +# RUN: -defsym OFF32=6 -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=0xfffffe \ +# RUN: -abs OFF32=6 -check=%s %t.o +# +# RUN: llvm-mc -triple=systemz-unknown-linux -mcpu=z16 
-position-independent \ +# RUN: -defsym OFF12=6 -defsym OFF16=4 -defsym OFF24=6 \ +# RUN: -defsym OFF32=0xffffffc8 -filetype=obj -o %t.o %s +# +# RUN: llvm-jitlink -noexec -abs OFF12=6 -abs OFF16=4 -abs OFF24=6 \ +# RUN: -abs OFF32=0xffffffc8 -check=%s %t.o + +# Check R_390_PC*dbl relocations. + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PC16DBL +# jitlink-check: *{2}(test_pc16dbl + 2) = (OFF16 >> 1) + .globl test_pc16dbl + .p2align 3 +test_pc16dbl: + je .Lpc16dbl + .space OFF16 - 4 +.Lpc16dbl: + jne test_pc16dbl + .size test_pc16dbl,.-test_pc16dbl + +# R_390_PC32DBL +# jitlink-check: *{4}(test_pc32dbl + 2) = (OFF32 >> 1) + .globl test_pc32dbl + .p2align 3 +test_pc32dbl: + jge .Lpc32dbl + .space OFF32 - 6 +.Lpc32dbl: + jgne test_pc32dbl + .size test_pc32dbl,.-test_pc32dbl + +# R_390_PC12DBL +# jitlink-check: ((*{2} (test_pc12dbl + 1)) & 0x0fff) = (OFF12 >> 1) + .globl test_pc12dbl + .p2align 4 +test_pc12dbl: + bprp 0, .Lpc12dbl, 0 + .space OFF12 - 6 +.Lpc12dbl: + bprp 0, test_pc12dbl, 0 + .size test_pc12dbl,.-test_pc12dbl + +# R_390_PC24DBL +# jitlink-check: ((*{4} (test_pc24dbl + 2)) & 0x0ffffff) = (OFF24 >> 1) + .globl test_pc24dbl + .p2align 4 +test_pc24dbl: + bprp 0, 0, .Lpc24dbl + .space OFF24 - 6 +.Lpc24dbl: + bprp 0, 0, test_pc24dbl + .size test_pc24dbl,.-test_pc24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s new file mode 100644 index 0000000000000..47f064b45816a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_plt.s @@ -0,0 +1,71 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t/elf_reloc.o %s +# +# RUN: llvm-jitlink -noexec \ +# RUN: -slab-allocate 100Kb -slab-address 0xffff0000 -slab-page-size 4096 \ +# RUN: -abs 
external_data=0x1 \ +# RUN: -abs foo=0x6ff04040 \ +# RUN: -abs bar=0x6ff04048 \ +# RUN: -check %s %t/elf_reloc.o + +# Check R_390_PLT32/64 relocations. + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + + .globl test_plt32_foo +# jitlink-check: *{4}test_plt32_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt32_foo +test_plt32_foo: + .reloc ., R_390_PLT32, foo + .space 4 + .size test_plt32_foo, .-test_plt32_foo + + .globl test_plt32_bar +# jitlink-check: *{4}test_plt32_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt32_bar +test_plt32_bar: + .reloc ., R_390_PLT32, bar + .space 4 + .size test_plt32_bar, .-test_plt32_bar + + .globl test_plt64_foo +# jitlink-check: *{8}test_plt64_foo = \ +# jitlink-check: stub_addr(elf_reloc.o, foo) - test_plt64_foo +test_plt64_foo: + .reloc ., R_390_PLT64, foo + .space 8 + .size test_plt64_foo, .-test_plt64_foo + + .globl test_plt64_bar +# jitlink-check: *{8}test_plt64_bar = \ +# jitlink-check: stub_addr(elf_reloc.o, bar) - test_plt64_bar +test_plt64_bar: + .reloc ., R_390_PLT64, bar + .space 8 + .size test_plt64_bar, .-test_plt64_bar + + .globl test_plt32dbl_foo +# jitlink-check: *{4}test_plt32dbl_foo = \ +# jitlink-check: (stub_addr(elf_reloc.o, foo) - test_plt32dbl_foo) >> 1 +test_plt32dbl_foo: + .reloc ., R_390_PLT32DBL, foo + .space 4 + .size test_plt32dbl_foo, .-test_plt32dbl_foo + + .globl test_plt32dbl_bar +# jitlink-check: *{4}test_plt32dbl_bar = \ +# jitlink-check: (stub_addr(elf_reloc.o, bar) - test_plt32dbl_bar) >> 1 +test_plt32dbl_bar: + .reloc ., R_390_PLT32DBL, bar + .space 4 + .size test_plt32dbl_bar, .-test_plt32dbl_bar + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s new file mode 100644 index 0000000000000..c36a77684008a --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/ELF_systemz_reloc_pltdbl.s @@ 
-0,0 +1,51 @@ +# REQUIRES: system-linux +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=systemz-unknown-linux -position-independent \ +# RUN: -mcpu=z16 -filetype=obj -o %t/elf_reloc.o %s + +# RUN: llvm-jitlink -noexec \ +# RUN: -abs external_addr12=0xffe \ +# RUN: -abs external_addr16=0xfffe \ +# RUN: -abs external_addr24=0xffffe \ +# RUN: %t/elf_reloc.o -check %s + + + .text + .section .text.main + .globl main + .p2align 4 + .type main,@function +main: + br %r14 + .size main, .-main + +# R_390_PLT16DBL +# jitlink-check: *{2}(test_plt16dbl + 4) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr16) - \ +# jitlink-check: test_plt16dbl) >> 1 + .globl test_plt16dbl + .p2align 4 +test_plt16dbl: + bpp 0, external_addr16@plt, 0 + .size test_plt16dbl,.-test_plt16dbl + +# R_390_PLT12DBL +# jitlink-check: ((*{2}(test_plt12dbl + 1)) & 0x0fff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr12) - \ +# jitlink-check: test_plt12dbl) >> 1 + .globl test_plt12dbl + .p2align 4 +test_plt12dbl: + bprp 0, external_addr12@plt, 0 + .size test_plt12dbl,.-test_plt12dbl + +# R_390_PLT24DBL +# jitlink-check: ((*{4}(test_plt24dbl + 2)) & 0x0ffffff) = \ +# jitlink-check: (stub_addr(elf_reloc.o, external_addr24) - \ +# jitlink-check: test_plt24dbl) >> 1 + .globl test_plt24dbl + .p2align 4 +test_plt24dbl: + bprp 0, 0, external_addr24@plt + .size test_plt24dbl,.-test_plt24dbl + diff --git a/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg new file mode 100644 index 0000000000000..caf81b69c06fd --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/systemz/lit.local.cfg @@ -0,0 +1,2 @@ +if not "SystemZ" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/ExecutionEngine/lit.local.cfg b/llvm/test/ExecutionEngine/lit.local.cfg index 1951f140ea889..bbffee852e10e 100644 --- a/llvm/test/ExecutionEngine/lit.local.cfg +++ b/llvm/test/ExecutionEngine/lit.local.cfg @@ -1,4 +1,4 @@ -if 
config.root.native_target in ['Sparc', 'SystemZ', 'Hexagon']: +if config.root.native_target in ['Sparc', 'Hexagon']: config.unsupported = True # ExecutionEngine tests are not expected to pass in a cross-compilation setup. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll index 877484f5159fd..212a5c99676f4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll @@ -63,7 +63,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) { ; CHECK-NEXT: store i32 [[STORE]], ptr [[NBRBOXES]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -114,7 +114,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; RVA23: middle.block: ; RVA23-NEXT: br label [[LOOP:%.*]] ; RVA23: exit: @@ -141,7 +141,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] 
+; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; RVA23ZVL1024B: middle.block: ; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]] ; RVA23ZVL1024B: exit: @@ -185,16 +185,16 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[TMP5]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( zeroinitializer, align 1 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: diff --git 
a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index c37bf74f9c1b0..d08ca8c99e8ba 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -8,14 +8,14 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]] -; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 +; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -55,11 +55,11 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], 
i64 [[N]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 +; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.h b/llvm/tools/llvm-diff/lib/DifferenceEngine.h index b829b2cd0bcbc..01fd0d9540dc2 100644 --- a/llvm/tools/llvm-diff/lib/DifferenceEngine.h +++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.h @@ -17,7 +17,6 @@ #include "DiffConsumer.h" #include "DiffLog.h" #include "llvm/ADT/StringRef.h" -#include namespace llvm { class Function; diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp index 2126b91f75ae1..45b8ed91ce52c 100644 --- a/llvm/tools/llvm-diff/llvm-diff.cpp +++ b/llvm/tools/llvm-diff/llvm-diff.cpp @@ -20,11 +20,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/WithColor.h" +#include "llvm/Support/raw_ostream.h" #include -#include - using namespace llvm; diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp index 66c770d9ca86b..afc20bed25914 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ 
b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -8,7 +8,6 @@ #include "MCInstrDescView.h" -#include #include #include "llvm/ADT/STLExtras.h" diff --git a/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp index b347cfdfc3923..902a1d79106f0 100644 --- a/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp +++ b/llvm/tools/llvm-pdbutil/PrettyCompilandDumper.cpp @@ -32,8 +32,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include - using namespace llvm; using namespace llvm::pdb; diff --git a/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp b/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp index 7ebca23ba7956..d692de75cfb89 100644 --- a/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp +++ b/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/Statistic.h" #include #include -#include #include #include diff --git a/llvm/unittests/ADT/CombinationGeneratorTest.cpp b/llvm/unittests/ADT/CombinationGeneratorTest.cpp index 219e18bc5e12c..cf187e68479c0 100644 --- a/llvm/unittests/ADT/CombinationGeneratorTest.cpp +++ b/llvm/unittests/ADT/CombinationGeneratorTest.cpp @@ -13,7 +13,6 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include -#include #include using namespace llvm; diff --git a/llvm/unittests/ADT/FunctionExtrasTest.cpp b/llvm/unittests/ADT/FunctionExtrasTest.cpp index 9809a92daac72..fdabdca269da2 100644 --- a/llvm/unittests/ADT/FunctionExtrasTest.cpp +++ b/llvm/unittests/ADT/FunctionExtrasTest.cpp @@ -11,7 +11,6 @@ #include "gtest/gtest.h" #include -#include using namespace llvm; diff --git a/llvm/unittests/ADT/IntervalMapTest.cpp b/llvm/unittests/ADT/IntervalMapTest.cpp index 38f397ff2eb54..06848e218da10 100644 --- a/llvm/unittests/ADT/IntervalMapTest.cpp +++ b/llvm/unittests/ADT/IntervalMapTest.cpp @@ -8,7 +8,6 @@ #include "llvm/ADT/IntervalMap.h" #include "gtest/gtest.h" -#include using namespace llvm; diff --git 
a/llvm/unittests/Support/HashBuilderTest.cpp b/llvm/unittests/Support/HashBuilderTest.cpp index 0aacfcd185ba0..70cb92b21848c 100644 --- a/llvm/unittests/Support/HashBuilderTest.cpp +++ b/llvm/unittests/Support/HashBuilderTest.cpp @@ -15,7 +15,6 @@ #include #include -#include #include #include diff --git a/llvm/unittests/Support/NativeFormatTests.cpp b/llvm/unittests/Support/NativeFormatTests.cpp index ac04c5a53d74a..974709efce9df 100644 --- a/llvm/unittests/Support/NativeFormatTests.cpp +++ b/llvm/unittests/Support/NativeFormatTests.cpp @@ -10,8 +10,6 @@ #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" -#include - using namespace llvm; namespace { diff --git a/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp b/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp index da713007f662d..321c092bc7518 100644 --- a/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp +++ b/llvm/unittests/Target/SPIRV/SPIRVAPITest.cpp @@ -20,7 +20,6 @@ #include "gtest/gtest.h" #include #include -#include namespace llvm { diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index 1e9378845854e..c0daac127f71a 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include #include using namespace llvm; diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.h b/llvm/utils/TableGen/Common/InfoByHwMode.h index ef688a6f6b3d1..ce84960ef79a7 100644 --- a/llvm/utils/TableGen/Common/InfoByHwMode.h +++ b/llvm/utils/TableGen/Common/InfoByHwMode.h @@ -24,7 +24,6 @@ #include #include #include -#include namespace llvm { diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h new file mode 100644 index 0000000000000..64a42a228199e --- /dev/null +++ 
b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h @@ -0,0 +1,21 @@ +//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H +#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H + +#include + +namespace mlir { +class Pass; + +#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 40d866ec7bf10..82bdfd02661a6 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -12,6 +12,7 @@ #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" +#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index e0cac8b699c30..d5665b439b059 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -186,6 +186,21 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> { ]; } +//===----------------------------------------------------------------------===// +// ArithToAPFloat +//===----------------------------------------------------------------------===// + +def ArithToAPFloatConversionPass + : 
Pass<"convert-arith-to-apfloat", "ModuleOp"> { + let summary = "Convert Arith ops to APFloat runtime library calls"; + let description = [{ + This pass converts supported Arith ops to APFloat-based runtime library + calls (APFloatWrappers.cpp). APFloat is a software implementation of + floating-point arithmetic operations. + }]; + let dependentDialects = ["func::FuncDialect"]; +} + //===----------------------------------------------------------------------===// // ArithToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h index 3576126a487ac..00d50874a2e8d 100644 --- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h @@ -60,6 +60,13 @@ mlir::FailureOr> deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp, mlir::ModuleOp moduleOp); +/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name +/// `name`. Return a failure if the FuncOp is found but with a different +/// signature. +FailureOr lookupFnDecl(SymbolOpInterface symTable, StringRef name, + FunctionType funcT, + SymbolTableCollection *symbolTables = nullptr); + } // namespace func } // namespace mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h index 8ad9ed18acebd..b09d32022e348 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h +++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h @@ -52,6 +52,10 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp, FailureOr lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp, SymbolTableCollection *symbolTables = nullptr); +FailureOr +lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp, + SymbolTableCollection *symbolTables = nullptr); + /// Declares a function to print a C-string. 
/// If a custom runtime function is defined via `runtimeFunctionName`, it must /// have the signature void(char const*). The default function is `printString`. diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td index 238fa42cae427..784bdd8e22f1f 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td @@ -109,7 +109,7 @@ def Linalg_PackOp : Linalg_RelayoutOp<"pack", [ within [0, n). - The tiled dimensions (of size `inner_tiles`) are added to the end of the result tensor in the order in which they appear, i.e. - `shape(result)[rank(result) + i] = inner_tiles[i]` for `0 <= i < k`. + `shape(result)[rank(source) + i] = inner_tiles[i]` for `0 <= i < k`. - The following relationship for the tiled dimensions holds: `shape(result)[inner_dims_pos[i]] = shape(source)[inner_dims_pos[i]] / inner_tiles[i]`, where (⌈/⌉ indicates CeilDiv). diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index d90f27bd037e6..40a466beee159 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -204,8 +204,8 @@ namespace { class PyRegionIterator { public: - PyRegionIterator(PyOperationRef operation) - : operation(std::move(operation)) {} + PyRegionIterator(PyOperationRef operation, int nextIndex) + : operation(std::move(operation)), nextIndex(nextIndex) {} PyRegionIterator &dunderIter() { return *this; } @@ -228,7 +228,7 @@ class PyRegionIterator { private: PyOperationRef operation; - int nextIndex = 0; + intptr_t nextIndex = 0; }; /// Regions of an op are fixed length and indexed numerically so are represented @@ -247,7 +247,7 @@ class PyRegionList : public Sliceable { PyRegionIterator dunderIter() { operation->checkValid(); - return PyRegionIterator(operation); + return PyRegionIterator(operation, startIndex); } static void bindDerived(ClassTy &c) { diff --git 
a/mlir/lib/Bindings/Python/NanobindUtils.h b/mlir/lib/Bindings/Python/NanobindUtils.h index 64ea4329f65f1..658e8ad5330ef 100644 --- a/mlir/lib/Bindings/Python/NanobindUtils.h +++ b/mlir/lib/Bindings/Python/NanobindUtils.h @@ -395,7 +395,6 @@ class Sliceable { /// Hook for derived classes willing to bind more methods. static void bindDerived(ClassTy &) {} -private: intptr_t startIndex; intptr_t length; intptr_t step; diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp new file mode 100644 index 0000000000000..01fd5b278aca4 --- /dev/null +++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp @@ -0,0 +1,158 @@ +//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Func/Utils/Utils.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Transforms/WalkPatternRewriteDriver.h" + +namespace mlir { +#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; +using namespace mlir::func; + +static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable, + StringRef name, FunctionType funcT, bool setPrivate, + SymbolTableCollection *symbolTables = nullptr) { + OpBuilder::InsertionGuard g(b); + assert(!symTable->getRegion(0).empty() && "expected non-empty region"); + b.setInsertionPointToStart(&symTable->getRegion(0).front()); + FuncOp funcOp = 
FuncOp::create(b, symTable->getLoc(), name, funcT); + if (setPrivate) + funcOp.setPrivate(); + if (symbolTables) { + SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable); + symbolTable.insert(funcOp, symTable->getRegion(0).front().begin()); + } + return funcOp; +} + +/// Helper function to look up or create the symbol for a runtime library +/// function for a binary arithmetic operation. +/// +/// Parameter 1: APFloat semantics +/// Parameter 2: Left-hand side operand +/// Parameter 3: Right-hand side operand +/// +/// This function will return a failure if the function is found but has an +/// unexpected signature. +/// +static FailureOr +lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name, + SymbolTableCollection *symbolTables = nullptr) { + auto i32Type = IntegerType::get(symTable->getContext(), 32); + auto i64Type = IntegerType::get(symTable->getContext(), 64); + + std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str(); + FunctionType funcT = + FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type}); + FailureOr func = + lookupFnDecl(symTable, funcName, funcT, symbolTables); + // Failed due to type mismatch. + if (failed(func)) + return func; + // Successfully matched existing decl. + if (*func) + return *func; + + return createFnDecl(b, symTable, funcName, funcT, + /*setPrivate=*/true, symbolTables); +} + +/// Rewrite a binary arithmetic operation to an APFloat function call. +template +struct BinaryArithOpToAPFloatConversion final : OpRewritePattern { + BinaryArithOpToAPFloatConversion(MLIRContext *context, PatternBenefit benefit, + SymbolOpInterface symTable) + : OpRewritePattern(context, benefit), symTable(symTable) {}; + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + // Get APFloat function from runtime library. 
+ FailureOr fn = + lookupOrCreateBinaryFn(rewriter, symTable, APFloatName); + if (failed(fn)) + return fn; + + rewriter.setInsertionPoint(op); + // Cast operands to 64-bit integers. + Location loc = op.getLoc(); + auto floatTy = cast(op.getType()); + auto intWType = rewriter.getIntegerType(floatTy.getWidth()); + auto int64Type = rewriter.getI64Type(); + Value lhsBits = arith::ExtUIOp::create( + rewriter, loc, int64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs())); + Value rhsBits = arith::ExtUIOp::create( + rewriter, loc, int64Type, + arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs())); + + // Call APFloat function. + int32_t sem = + llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics()); + Value semValue = arith::ConstantOp::create( + rewriter, loc, rewriter.getI32Type(), + rewriter.getIntegerAttr(rewriter.getI32Type(), sem)); + SmallVector params = {semValue, lhsBits, rhsBits}; + auto resultOp = + func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()), + SymbolRefAttr::get(*fn), params); + + // Truncate result to the original width. 
+ Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType, + resultOp->getResult(0)); + rewriter.replaceOp( + op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits)); + return success(); + } + + SymbolOpInterface symTable; +}; + +namespace { +struct ArithToAPFloatConversionPass final + : impl::ArithToAPFloatConversionPassBase { + using Base::Base; + + void runOnOperation() override { + MLIRContext *context = &getContext(); + RewritePatternSet patterns(context); + static const char add[] = "add"; + static const char subtract[] = "subtract"; + static const char multiply[] = "multiply"; + static const char divide[] = "divide"; + static const char remainder[] = "remainder"; + patterns.add, + BinaryArithOpToAPFloatConversion, + BinaryArithOpToAPFloatConversion, + BinaryArithOpToAPFloatConversion, + BinaryArithOpToAPFloatConversion>( + context, 1, getOperation()); + LogicalResult result = success(); + ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) { + if (diag.getSeverity() == DiagnosticSeverity::Error) { + result = failure(); + } + // NB: if you don't return failure, no other diag handlers will fire (see + // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit). 
+ return failure(); + }); + walkAndApplyPatterns(getOperation(), std::move(patterns)); + if (failed(result)) + return signalPassFailure(); + } +}; +} // namespace diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt new file mode 100644 index 0000000000000..b5ec49c087163 --- /dev/null +++ b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_conversion_library(MLIRArithToAPFloat + ArithToAPFloat.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRArithTransforms + MLIRFuncDialect + MLIRFuncUtils + ) diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index b6099902cc337..f2bacc3399144 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -14,6 +14,7 @@ #include "mlir/Conversion/LLVMCommon/VectorPattern.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/TypeUtilities.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index bebf1b8fff3f9..613dc6d242ceb 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard) add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithCommon) add_subdirectory(ArithToAMDGPU) +add_subdirectory(ArithToAPFloat) add_subdirectory(ArithToArmSME) add_subdirectory(ArithToEmitC) add_subdirectory(ArithToLLVM) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 69a317ecd101f..c747e1b59558a 100644 --- 
a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1654,6 +1654,20 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern { return failure(); } } + } else if (auto floatTy = dyn_cast(printType)) { + // Print other floating-point types using the APFloat runtime library. + int32_t sem = + llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics()); + Value semValue = LLVM::ConstantOp::create( + rewriter, loc, rewriter.getI32Type(), + rewriter.getIntegerAttr(rewriter.getI32Type(), sem)); + Value floatBits = + LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value); + printer = + LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables); + emitCall(rewriter, loc, printer.value(), + ValueRange({semValue, floatBits})); + return success(); } else { return failure(); } diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp index adeb50b6da628..c4e81e5dbed21 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp @@ -35,7 +35,7 @@ static Value createConst(Location loc, Type type, int value, } /// Create a float constant. 
-static Value createFloatConst(Location loc, Type type, APFloat value, +static Value createFloatConst(Location loc, Type type, const APFloat &value, PatternRewriter &rewriter) { auto attr = rewriter.getFloatAttr(getElementTypeOrSelf(type), value); if (auto shapedTy = dyn_cast(type)) { diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp index b4cb0932ef631..d6dfd0229963c 100644 --- a/mlir/lib/Dialect/Func/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp @@ -254,3 +254,28 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp, return std::make_pair(*newFuncOpOrFailure, newCallOp); } + +FailureOr +func::lookupFnDecl(SymbolOpInterface symTable, StringRef name, + FunctionType funcT, SymbolTableCollection *symbolTables) { + FuncOp func; + if (symbolTables) { + func = symbolTables->lookupSymbolIn( + symTable, StringAttr::get(symTable->getContext(), name)); + } else { + func = llvm::dyn_cast_or_null( + SymbolTable::lookupSymbolIn(symTable, name)); + } + + if (!func) + return func; + + mlir::FunctionType foundFuncT = func.getFunctionType(); + // Assert the signature of the found function is same as expected + if (funcT != foundFuncT) { + return func.emitError("matched function '") + << name << "' but with different type: " << foundFuncT + << " (expected " << funcT << ")"; + } + return func; +} diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp index feaffa34897b6..160b6ae89215c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp @@ -30,6 +30,7 @@ static constexpr llvm::StringRef kPrintF16 = "printF16"; static constexpr llvm::StringRef kPrintBF16 = "printBF16"; static constexpr llvm::StringRef kPrintF32 = "printF32"; static constexpr llvm::StringRef kPrintF64 = "printF64"; +static constexpr llvm::StringRef kPrintApFloat = "printApFloat"; static constexpr llvm::StringRef 
kPrintString = "printString"; static constexpr llvm::StringRef kPrintOpen = "printOpen"; static constexpr llvm::StringRef kPrintClose = "printClose"; @@ -160,6 +161,16 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp, LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables); } +FailureOr +mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp, + SymbolTableCollection *symbolTables) { + return lookupOrCreateReservedFn( + b, moduleOp, kPrintApFloat, + {IntegerType::get(moduleOp->getContext(), 32), + IntegerType::get(moduleOp->getContext(), 64)}, + LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables); +} + static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) { return LLVM::LLVMPointerType::get(context); } diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp new file mode 100644 index 0000000000000..0a05f7369e556 --- /dev/null +++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp @@ -0,0 +1,89 @@ +//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the APFloat infrastructure to MLIR programs as a runtime +// library. APFloat is a software implementation of floating point arithmetics. +// +// On the MLIR side, floating-point values must be bitcasted to 64-bit integers +// before calling a runtime function. If a floating-point type has less than +// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an +// integer. 
+// +// Runtime functions receive the floating-point operands of the arithmetic +// operation in the form of 64-bit integers, along with the APFloat semantics +// in the form of a 32-bit integer, which will be interpreted as an +// APFloatBase::Semantics enum value. +// +#include "llvm/ADT/APFloat.h" + +#ifdef _WIN32 +#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT +#ifdef mlir_apfloat_wrappers_EXPORTS +// We are building this library +#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport) +#else +// We are using this library +#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport) +#endif // mlir_apfloat_wrappers_EXPORTS +#endif // MLIR_APFLOAT_WRAPPERS_EXPORT +#else +// Non-windows: use visibility attributes. +#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// Binary operations without rounding mode. +#define APFLOAT_BINARY_OP(OP) \ + MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP( \ + int32_t semantics, uint64_t a, uint64_t b) { \ + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ + static_cast(semantics)); \ + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ + llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ + llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ + lhs.OP(rhs); \ + return lhs.bitcastToAPInt().getZExtValue(); \ + } + +/// Binary operations with rounding mode. 
+#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE) \ + MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP( \ + int32_t semantics, uint64_t a, uint64_t b) { \ + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( \ + static_cast(semantics)); \ + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); \ + llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a)); \ + llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b)); \ + lhs.OP(rhs, ROUNDING_MODE); \ + return lhs.bitcastToAPInt().getZExtValue(); \ + } + +extern "C" { + +#define BIN_OPS_WITH_ROUNDING(X) \ + X(add, llvm::RoundingMode::NearestTiesToEven) \ + X(subtract, llvm::RoundingMode::NearestTiesToEven) \ + X(multiply, llvm::RoundingMode::NearestTiesToEven) \ + X(divide, llvm::RoundingMode::NearestTiesToEven) + +BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE) +#undef BIN_OPS_WITH_ROUNDING +#undef APFLOAT_BINARY_OP_ROUNDING_MODE + +APFLOAT_BINARY_OP(remainder) + +#undef APFLOAT_BINARY_OP + +MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) { + const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics( + static_cast(semantics)); + unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem); + llvm::APFloat x(sem, llvm::APInt(bitWidth, a)); + double d = x.convertToDouble(); + fprintf(stdout, "%lg", d); +} +} diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index fdeb4dacf9278..8c09e50e4de7b 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -2,6 +2,7 @@ # is a big dependency which most don't need. 
set(LLVM_OPTIONAL_SOURCES + APFloatWrappers.cpp ArmRunnerUtils.cpp ArmSMEStubs.cpp AsyncRuntime.cpp @@ -167,6 +168,15 @@ if(LLVM_ENABLE_PIC) set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17) target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS) + add_mlir_library(mlir_apfloat_wrappers + SHARED + APFloatWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + set_property(TARGET mlir_apfloat_wrappers PROPERTY CXX_STANDARD 17) + target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS) + add_subdirectory(SparseTensor) add_mlir_library(mlir_c_runner_utils @@ -177,6 +187,7 @@ if(LLVM_ENABLE_PIC) EXCLUDE_FROM_LIBMLIR LINK_LIBS PUBLIC + mlir_apfloat_wrappers mlir_float16_utils MLIRSparseTensorEnums MLIRSparseTensorRuntime @@ -191,6 +202,7 @@ if(LLVM_ENABLE_PIC) EXCLUDE_FROM_LIBMLIR LINK_LIBS PUBLIC + mlir_apfloat_wrappers mlir_float16_utils ) target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 9945a711d5c74..365bfddeaab73 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/ScopedPrinter.h" #include +#include using namespace mlir; using namespace mlir::detail; @@ -1051,7 +1052,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { MLIRContext *context, std::function opErasedCallback = nullptr) : RewriterBase(context, /*listener=*/this), - opErasedCallback(opErasedCallback) {} + opErasedCallback(std::move(opErasedCallback)) {} /// Erase the given op (unless it was already erased). 
void eraseOp(Operation *op) override { diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir new file mode 100644 index 0000000000000..797f42c37a26f --- /dev/null +++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir @@ -0,0 +1,128 @@ +// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64 + +// CHECK-LABEL: func.func @foo() -> f8E4M3FN { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN +// CHECK: return %[[CONSTANT_0]] : f8E4M3FN +// CHECK: } + +// CHECK-LABEL: func.func @bar() -> f6E3M2FN { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN +// CHECK: return %[[CONSTANT_0]] : f6E3M2FN +// CHECK: } + +// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine +// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width. 
+// CHECK-LABEL: func.func @full_example() { +// CHECK: %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN +// CHECK: %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN +// CHECK: %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8 +// CHECK: %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64 +// CHECK: %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8 +// CHECK: %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64 +// // fltSemantics semantics for f8E4M3FN +// CHECK: %[[CONSTANT_1:.*]] = arith.constant 10 : i32 +// CHECK: %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64 +// CHECK: %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8 +// CHECK: %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN +// CHECK: vector.print %[[BITCAST_2]] : f8E4M3FN + +// CHECK: %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN +// CHECK: %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN +// CHECK: %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6 +// CHECK: %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64 +// CHECK: %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6 +// CHECK: %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64 +// // fltSemantics semantics for f6E3M2FN +// CHECK: %[[CONSTANT_3:.*]] = arith.constant 16 : i32 +// CHECK: %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64 +// CHECK: %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6 +// CHECK: %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN +// CHECK: vector.print %[[BITCAST_5]] : f6E3M2FN +// CHECK: return +// CHECK: } + +// Put rhs into separate function so that it won't be constant-folded. 
+func.func @foo() -> f8E4M3FN { + %cst = arith.constant 2.2 : f8E4M3FN + return %cst : f8E4M3FN +} + +func.func @bar() -> f6E3M2FN { + %cst = arith.constant 3.2 : f6E3M2FN + return %cst : f6E3M2FN +} + +func.func @full_example() { + %a = arith.constant 1.4 : f8E4M3FN + %b = func.call @foo() : () -> (f8E4M3FN) + %c = arith.addf %a, %b : f8E4M3FN + vector.print %c : f8E4M3FN + + %d = arith.constant 2.4 : f6E3M2FN + %e = func.call @bar() : () -> (f6E3M2FN) + %f = arith.addf %d, %e : f6E3M2FN + vector.print %f : f6E3M2FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.addf %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// Test decl collision (different type) +// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}} +func.func private @_mlir_apfloat_add(i32, i32, f32) -> index +func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.addf %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.subf %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.mulf %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64 
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.divf %arg0, %arg1 : f4E2M1FN + return +} + +// ----- + +// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64 +// CHECK: %[[sem:.*]] = arith.constant 18 : i32 +// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64 +func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) { + %0 = arith.remf %arg0, %arg1 : f4E2M1FN + return +} diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 22fde3b7d28a5..80886c31cb58f 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -1276,6 +1276,42 @@ func.func @test_matmul_t_block_scaled_mxint8(%arg0: tensor<4x8x32x!tosa.mxint8>, return %0 : tensor<4x8x16xf32> } +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp6e3m2_e2e +func.func @test_matmul_t_block_scaled_fp6e3m2_e2e(%arg0: tensor<6x2x32xf32>, %arg1: tensor<6x64x32xf32>) -> tensor<6x2x64xf32> { + %a, %sa = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf32>) -> (tensor<6x2x32xf6E3M2FN>, tensor<6x2x1xf8E8M0FNU>) + %b, %sb = tosa.cast_to_block_scaled %arg1 {block_size = #tosa.block_size : i32} : (tensor<6x64x32xf32>) -> (tensor<6x64x32xf6E3M2FN>, tensor<6x64x1xf8E8M0FNU>) + %res = tosa.matmul_t_block_scaled %a, %sa, %b, %sb {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf6E3M2FN>, tensor<6x2x1xf8E8M0FNU>, tensor<6x64x32xf6E3M2FN>, tensor<6x64x1xf8E8M0FNU>) -> tensor<6x2x64xf32> + return %res : tensor<6x2x64xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp6e2m3_e2e +func.func @test_matmul_t_block_scaled_fp6e2m3_e2e(%arg0: tensor<6x2x32xf32>, %arg1: tensor<6x64x32xf32>) -> tensor<6x2x64xf32> { + %a, %sa = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf32>) -> 
(tensor<6x2x32xf6E2M3FN>, tensor<6x2x1xf8E8M0FNU>) + %b, %sb = tosa.cast_to_block_scaled %arg1 {block_size = #tosa.block_size : i32} : (tensor<6x64x32xf32>) -> (tensor<6x64x32xf6E2M3FN>, tensor<6x64x1xf8E8M0FNU>) + %res = tosa.matmul_t_block_scaled %a, %sa, %b, %sb {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf6E2M3FN>, tensor<6x2x1xf8E8M0FNU>, tensor<6x64x32xf6E2M3FN>, tensor<6x64x1xf8E8M0FNU>) -> tensor<6x2x64xf32> + return %res : tensor<6x2x64xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_fp4e2m1_e2e +func.func @test_matmul_t_block_scaled_fp4e2m1_e2e(%arg0: tensor<6x2x32xf32>, %arg1: tensor<6x64x32xf32>) -> tensor<6x2x64xf32> { + %a, %sa = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf32>) -> (tensor<6x2x32xf4E2M1FN>, tensor<6x2x1xf8E8M0FNU>) + %b, %sb = tosa.cast_to_block_scaled %arg1 {block_size = #tosa.block_size : i32} : (tensor<6x64x32xf32>) -> (tensor<6x64x32xf4E2M1FN>, tensor<6x64x1xf8E8M0FNU>) + %res = tosa.matmul_t_block_scaled %a, %sa, %b, %sb {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf4E2M1FN>, tensor<6x2x1xf8E8M0FNU>, tensor<6x64x32xf4E2M1FN>, tensor<6x64x1xf8E8M0FNU>) -> tensor<6x2x64xf32> + return %res : tensor<6x2x64xf32> +} + +// ----- +// CHECK-LABEL: test_matmul_t_block_scaled_mxint8_e2e +func.func @test_matmul_t_block_scaled_mxint8_e2e(%arg0: tensor<6x2x32xf32>, %arg1: tensor<6x64x32xf32>) -> tensor<6x2x64xf32> { + %a, %sa = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32} : (tensor<6x2x32xf32>) -> (tensor<6x2x32x!tosa.mxint8>, tensor<6x2x1xf8E8M0FNU>) + %b, %sb = tosa.cast_to_block_scaled %arg1 {block_size = #tosa.block_size : i32} : (tensor<6x64x32xf32>) -> (tensor<6x64x32x!tosa.mxint8>, tensor<6x64x1xf8E8M0FNU>) + %res = tosa.matmul_t_block_scaled %a, %sa, %b, %sb {block_size = #tosa.block_size : i32} : (tensor<6x2x32x!tosa.mxint8>, tensor<6x2x1xf8E8M0FNU>, tensor<6x64x32x!tosa.mxint8>, tensor<6x64x1xf8E8M0FNU>) -> tensor<6x2x64xf32> + 
return %res : tensor<6x2x64xf32> +} + // ----- // CHECK-LABEL: test_cast_from_block_scaled_static func.func @test_cast_from_block_scaled_static(%arg0: tensor<4x32xf4E2M1FN>, %arg1: tensor<4x1xf8E8M0FNU>) -> tensor<4x32xf32> { @@ -1307,7 +1343,7 @@ func.func @test_cast_to_block_scaled_unranked(%arg0: tensor<*xf32>) -> (tensor<* // ----- // CHECK-LABEL: test_cast_to_block_scaled_mxint8 func.func @test_cast_to_block_scaled_mxint8(%arg0: tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) { - %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32, stochastic_round = false} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) + %0:2 = tosa.cast_to_block_scaled %arg0 {block_size = #tosa.block_size : i32} : (tensor<4x32xf32>) -> (tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU>) return %0#0, %0#1 : tensor<4x32x!tosa.mxint8>, tensor<4x1xf8E8M0FNU> } diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir new file mode 100644 index 0000000000000..2768afe0834b5 --- /dev/null +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir @@ -0,0 +1,36 @@ +// Case 1: All floating-point arithmetics is lowered through APFloat. +// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \ +// RUN: mlir-runner -e entry --entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s + +// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat. +// Arithmetics on f32 is lowered directly to LLVM. 
+// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \ +// RUN: --convert-to-llvm --reconcile-unrealized-casts | \ +// RUN: mlir-runner -e entry --entry-point-result=void \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --shared-libs=%mlir_apfloat_wrappers | FileCheck %s + +// Put rhs into separate function so that it won't be constant-folded. +func.func @foo() -> (f8E4M3FN, f32) { + %cst1 = arith.constant 2.2 : f8E4M3FN + %cst2 = arith.constant 2.2 : f32 + return %cst1, %cst2 : f8E4M3FN, f32 +} + +func.func @entry() { + %a1 = arith.constant 1.4 : f8E4M3FN + %a2 = arith.constant 1.4 : f32 + %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32) + %c1 = arith.addf %a1, %b1 : f8E4M3FN // not supported by LLVM + %c2 = arith.addf %a2, %b2 : f32 // supported by LLVM + + // CHECK: 3.5 + vector.print %c1 : f8E4M3FN + + // CHECK: 3.6 + vector.print %c2 : f32 + + return +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 6ff12d66523f5..4a38ed605be0c 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -208,6 +208,7 @@ def find_real_python_interpreter(): add_runtime("mlir_c_runner_utils"), add_runtime("mlir_async_runtime"), add_runtime("mlir_float16_utils"), + add_runtime("mlir_apfloat_wrappers"), "mlir-linalg-ods-yaml-gen", "mlir-reduce", "mlir-pdll", diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index f5fa4dad856f8..1bdd345d98c05 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -2,12 +2,12 @@ import gc import io -import itertools from tempfile import NamedTemporaryFile from mlir.ir import * from mlir.dialects.builtin import ModuleOp -from mlir.dialects import arith +from mlir.dialects import arith, func, scf from mlir.dialects._ods_common import _cext +from mlir.extras import types as T def run(f): @@ -1199,3 +1199,27 @@ def testGetOwnerConcreteOpview(): r = arith.AddIOp(a, a, overflowFlags=arith.IntegerOverflowFlags.nsw) for u in a.result.uses: assert 
isinstance(u.owner, arith.AddIOp) + + +# CHECK-LABEL: TEST: testIndexSwitch +@run +def testIndexSwitch(): + with Context() as ctx, Location.unknown(): + i32 = T.i32() + module = Module.create() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func(T.index()) + def index_switch(index): + c1 = arith.constant(i32, 1) + switch_op = scf.IndexSwitchOp( + results_=[i32], arg=index, cases=range(3), num_caseRegions=3 + ) + + assert len(switch_op.regions) == 4 + assert len(switch_op.regions[2:]) == 2 + assert len([i for i in switch_op.regions[2:]]) == 2 + assert len(switch_op.caseRegions) == 3 + assert len([i for i in switch_op.caseRegions]) == 3 + assert len(switch_op.caseRegions[1:]) == 2 + assert len([i for i in switch_op.caseRegions[1:]]) == 2 diff --git a/mlir/tools/tblgen-to-irdl/tblgen-to-irdl.cpp b/mlir/tools/tblgen-to-irdl/tblgen-to-irdl.cpp index 092ec2ebe81a6..33421b4e2ddd6 100644 --- a/mlir/tools/tblgen-to-irdl/tblgen-to-irdl.cpp +++ b/mlir/tools/tblgen-to-irdl/tblgen-to-irdl.cpp @@ -18,10 +18,11 @@ using namespace llvm; using namespace mlir; // Generator that prints records. -GenRegistration printRecords("print-records", "Print all records to stdout", - [](const RecordKeeper &records, raw_ostream &os) { - os << records; - return false; - }); +static GenRegistration + printRecords("print-records", "Print all records to stdout", + [](const RecordKeeper &records, raw_ostream &os) { + os << records; + return false; + }); int main(int argc, char **argv) { return MlirTblgenMain(argc, argv); } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 7066f498c7d49..674a3ec8e18ed 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3881,6 +3881,7 @@ cc_library( includes = ["include"], deps = [ ":DialectUtils", + ":GPUDialect", ":IR", ":SCFDialect", ":TransformDialect",