From bfd4155f234b4e7f826cb57cad7e9876acfac046 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Mon, 10 Nov 2025 20:10:40 +0800
Subject: [PATCH 01/28] [VPlan] Don't apply predication discount to
 non-originally-predicated blocks (#160449)

Split off from #158690. Currently if an instruction needs predicated due
to tail folding, it will also have a predicated discount applied to it
in multiple places.
This is likely inaccurate because we can expect a tail folded
instruction to be executed on every iteration bar the last.

This fixes it by checking if the instruction/block was originally
predicated, and in doing so prevents vectorization with tail folding
where we would have had to scalarize the memory op anyway.

On llvm-test-suite this causes 4 loops in total to no longer be
vectorized with -O3 on arm64-apple-darwin, and there's no observable
performance impact.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  44 +++-
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h  |  19 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   2 +-
 .../AArch64/conditional-branches-cost.ll      |  66 +----
 .../AArch64/induction-costs-sve.ll            | 245 +-----------------
 .../LoopVectorize/ARM/optsize_minsize.ll      | 189 ++------------
 .../CostModel/masked-interleaved-store-i16.ll |  12 +-
 .../X86/fixed-order-recurrence.ll             |  67 +----
 ...-order-recurrence-sink-replicate-region.ll |  55 ++--
 .../vplan-sink-scalars-and-merge.ll           |   2 +-
 10 files changed, 134 insertions(+), 567 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 45b5570261416..566d6eafee63e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1232,6 +1232,30 @@ class LoopVectorizationCostModel {
   /// Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I) const;
 
+  /// A helper function that returns how much we should divide the cost of a
+  /// predicated block by. Typically this is the reciprocal of the block
+  /// probability, i.e. if we return X we are assuming the predicated block will
+  /// execute once for every X iterations of the loop header so the block should
+  /// only contribute 1/X of its cost to the total cost calculation, but when
+  /// optimizing for code size it will just be 1 as code size costs don't depend
+  /// on execution probabilities.
+  ///
+  /// TODO: We should use actual block probability here, if available.
+  /// Currently, we always assume predicated blocks have a 50% chance of
+  /// executing, apart from blocks that are only predicated due to tail folding.
+  inline unsigned
+  getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
+                          BasicBlock *BB) const {
+    // If a block wasn't originally predicated but was predicated due to
+    // e.g. tail folding, don't divide the cost. Tail folded loops may still be
+    // predicated in the final vector loop iteration, but for most loops that
+    // don't have low trip counts we can expect their probability to be close to
+    // zero.
+    if (!Legal->blockNeedsPredication(BB))
+      return 1;
+    return CostKind == TTI::TCK_CodeSize ? 1 : 2;
+  }
+
   /// Return the costs for our two available strategies for lowering a
   /// div/rem operation which requires speculating at least one lane.
   /// First result is for scalarization (will be invalid for scalable
@@ -2887,7 +2911,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
+    ScalarizationCost =
+        ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
   }
 
   InstructionCost SafeDivisorCost = 0;
@@ -5032,7 +5057,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
       }
 
     // Scale the total scalar cost by block probability.
-    ScalarCost /= getPredBlockCostDivisor(CostKind);
+    ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
 
     // Compute the discount. A non-negative discount means the vector version
     // of the instruction costs more, and scalarizing would be beneficial.
@@ -5082,10 +5107,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
     // stores and instructions that may divide by zero) will now be
     // unconditionally executed. For the scalar case, we may not always execute
     // the predicated block, if it is an if-else block. Thus, scale the block's
-    // cost by the probability of executing it. blockNeedsPredication from
-    // Legal is used so as to not include all blocks in tail folded loops.
-    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost /= getPredBlockCostDivisor(CostKind);
+    // cost by the probability of executing it.
+    // getPredBlockCostDivisor will return 1 for blocks that are only predicated
+    // by the header mask when folding the tail.
+    if (VF.isScalar())
+      BlockCost /= getPredBlockCostDivisor(CostKind, BB);
 
     Cost += BlockCost;
   }
@@ -5164,7 +5190,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
   if (isPredicatedInst(I)) {
-    Cost /= getPredBlockCostDivisor(CostKind);
+    Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
 
     // Add the cost of an i1 extract and a branch
     auto *VecI1Ty =
@@ -6732,6 +6758,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
          SkipCostComputation.contains(UI);
 }
 
+unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
+  return CM.getPredBlockCostDivisor(CostKind, BB);
+}
+
 InstructionCost
 LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                           VPCostContext &CostCtx) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index 965426f86ff21..caabfa7275b69 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                        int64_t Step);
 
-/// A helper function that returns how much we should divide the cost of a
-/// predicated block by. Typically this is the reciprocal of the block
-/// probability, i.e. if we return X we are assuming the predicated block will
-/// execute once for every X iterations of the loop header so the block should
-/// only contribute 1/X of its cost to the total cost calculation, but when
-/// optimizing for code size it will just be 1 as code size costs don't depend
-/// on execution probabilities.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-///       we always assume predicated blocks have a 50% chance of executing.
-inline unsigned
-getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
-  return CostKind == TTI::TCK_CodeSize ? 1 : 2;
-}
-
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
 /// [1, 16) = {1, 2, 4, 8}
@@ -367,6 +352,10 @@ struct VPCostContext {
   /// has already been pre-computed.
   bool skipCostComputation(Instruction *UI, bool IsVector) const;
 
+  /// \returns how much the cost of a predicated block should be divided by.
+  /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor.
+  unsigned getPredBlockCostDivisor(BasicBlock *BB) const;
+
   /// Returns the OperandInfo for \p V, if it is a live-in.
   TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 80cd112dbcd8a..707886f873fba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3349,7 +3349,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
+    ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent());
     return ScalarCost;
   }
   case Instruction::Load:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f16351720b20f..8d878f47d1ece 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -613,63 +613,17 @@ exit:
 define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
 ; COMMON-SAME: ptr [[DST:%.*]]) {
-; COMMON-NEXT:  [[ENTRY:.*:]]
-; COMMON-NEXT:    br label %[[VECTOR_PH:.*]]
-; COMMON:       [[VECTOR_PH]]:
-; COMMON-NEXT:    br label %[[VECTOR_BODY:.*]]
-; COMMON:       [[VECTOR_BODY]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; COMMON:       [[PRED_STORE_IF]]:
-; COMMON-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
-; COMMON-NEXT:    store i8 0, ptr [[TMP0]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; COMMON:       [[PRED_STORE_CONTINUE]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
-; COMMON:       [[PRED_STORE_IF1]]:
-; COMMON-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
-; COMMON-NEXT:    store i8 1, ptr [[TMP1]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
-; COMMON:       [[PRED_STORE_CONTINUE2]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; COMMON:       [[PRED_STORE_IF3]]:
-; COMMON-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
-; COMMON-NEXT:    store i8 2, ptr [[TMP2]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
-; COMMON:       [[PRED_STORE_CONTINUE4]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
-; COMMON:       [[PRED_STORE_IF5]]:
-; COMMON-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
-; COMMON-NEXT:    store i8 3, ptr [[TMP3]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
-; COMMON:       [[PRED_STORE_CONTINUE6]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; COMMON:       [[PRED_STORE_IF7]]:
-; COMMON-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
-; COMMON-NEXT:    store i8 4, ptr [[TMP4]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
-; COMMON:       [[PRED_STORE_CONTINUE8]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; COMMON:       [[PRED_STORE_IF9]]:
-; COMMON-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
-; COMMON-NEXT:    store i8 5, ptr [[TMP5]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
-; COMMON:       [[PRED_STORE_CONTINUE10]]:
-; COMMON-NEXT:    br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
-; COMMON:       [[PRED_STORE_IF11]]:
-; COMMON-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
-; COMMON-NEXT:    store i8 6, ptr [[TMP6]], align 1
-; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
-; COMMON:       [[PRED_STORE_CONTINUE12]]:
-; COMMON-NEXT:    br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
-; COMMON:       [[PRED_STORE_IF13]]:
-; COMMON-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
-; COMMON-NEXT:    store i8 7, ptr [[TMP7]], align 1
-; COMMON-NEXT:    br label %[[EXIT]]
+; COMMON-NEXT:  [[ENTRY:.*]]:
+; COMMON-NEXT:    br label %[[LOOP:.*]]
+; COMMON:       [[LOOP]]:
+; COMMON-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; COMMON-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
+; COMMON-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; COMMON-NEXT:    store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
+; COMMON-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; COMMON-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
+; COMMON-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; COMMON:       [[EXIT]]:
-; COMMON-NEXT:    br label %[[SCALAR_PH:.*]]
-; COMMON:       [[SCALAR_PH]]:
-; COMMON-NEXT:    br label %[[EXIT1:.*]]
-; COMMON:       [[EXIT1]]:
 ; COMMON-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
index cfc6cc87a2a21..4b097ba2422e4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -271,69 +271,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ;
 ; PRED-LABEL: define void @iv_trunc(
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; PRED-NEXT:  [[ENTRY:.*:]]
+; PRED-NEXT:  [[ENTRY:.*]]:
 ; PRED-NEXT:    [[MUL_X:%.*]] = add i32 [[X]], 1
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br label %[[VECTOR_SCEVCHECK:.*]]
-; PRED:       [[VECTOR_SCEVCHECK]]:
-; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0
-; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]]
-; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
-; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
-; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0
-; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
-; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
-; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 2
-; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2
-; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
-; PRED:       [[VECTOR_BODY]]:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ]
-; PRED-NEXT:    [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64>
-; PRED-NEXT:    [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED:       [[PRED_STORE_IF]]:
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0
-; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT:    store i32 1, ptr [[TMP20]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; PRED:       [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT:    [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]]
-; PRED:       [[PRED_STORE_IF1]]:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1
-; PRED-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]]
-; PRED-NEXT:    store i32 1, ptr [[TMP23]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
-; PRED:       [[PRED_STORE_CONTINUE2]]:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP25:%.*]] = xor i1 [[TMP24]], true
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
-; PRED-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PRED:       [[MIDDLE_BLOCK]]:
-; PRED-NEXT:    br label %[[EXIT:.*]]
-; PRED:       [[SCALAR_PH]]:
 ; PRED-NEXT:    br label %[[FOR_BODY:.*]]
 ; PRED:       [[FOR_BODY]]:
-; PRED-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; PRED-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
 ; PRED-NEXT:    [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
 ; PRED-NEXT:    [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]]
 ; PRED-NEXT:    [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64
@@ -341,7 +283,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    store i32 1, ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; PRED-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]]
 ; PRED:       [[EXIT]]:
 ; PRED-NEXT:    ret void
 ;
@@ -437,101 +379,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
 ;
 ; PRED-LABEL: define void @trunc_ivs_and_store(
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; PRED-NEXT:  [[ENTRY:.*:]]
-; PRED-NEXT:    [[MUL:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br label %[[VECTOR_SCEVCHECK:.*]]
-; PRED:       [[VECTOR_SCEVCHECK]]:
+; PRED-NEXT:  [[ENTRY:.*]]:
 ; PRED-NEXT:    [[TMP1:%.*]] = mul i32 [[X]], [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = sub i32 0, [[TMP1]]
-; PRED-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0
-; PRED-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]]
-; PRED-NEXT:    [[TMP5:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; PRED-NEXT:    [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0
-; PRED-NEXT:    [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
-; PRED-NEXT:    [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0
-; PRED-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; PRED-NEXT:    br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
-; PRED:       [[VECTOR_BODY]]:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT:    [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED:       [[PRED_STORE_IF]]:
-; PRED-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
-; PRED-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
-; PRED-NEXT:    [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
-; PRED-NEXT:    store i32 [[TMP22]], ptr [[TMP21]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; PRED:       [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]]
-; PRED:       [[PRED_STORE_IF2]]:
-; PRED-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
-; PRED-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
-; PRED-NEXT:    [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
-; PRED-NEXT:    store i32 [[TMP26]], ptr [[TMP25]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
-; PRED:       [[PRED_STORE_CONTINUE3]]:
-; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT:    br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
-; PRED:       [[PRED_STORE_IF4]]:
-; PRED-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
-; PRED-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
-; PRED-NEXT:    [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT:    store i32 [[TMP30]], ptr [[TMP29]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE5]]
-; PRED:       [[PRED_STORE_CONTINUE5]]:
-; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT:    br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
-; PRED:       [[PRED_STORE_IF6]]:
-; PRED-NEXT:    [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
-; PRED-NEXT:    [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
-; PRED-NEXT:    [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT:    store i32 [[TMP34]], ptr [[TMP33]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
-; PRED:       [[PRED_STORE_CONTINUE7]]:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP35]], true
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT:    br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; PRED:       [[MIDDLE_BLOCK]]:
-; PRED-NEXT:    br label %[[EXIT:.*]]
-; PRED:       [[SCALAR_PH]]:
 ; PRED-NEXT:    br label %[[LOOP:.*]]
 ; PRED:       [[LOOP]]:
-; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
-; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
 ; PRED-NEXT:    [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
-; PRED-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]]
+; PRED-NEXT:    [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]]
 ; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
 ; PRED-NEXT:    [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64
 ; PRED-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]]
 ; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
 ; PRED-NEXT:    [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; PRED-NEXT:    br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; PRED-NEXT:    br i1 [[EXITCOND_3_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
 ; PRED:       [[EXIT]]:
 ; PRED-NEXT:    ret void
 ;
@@ -627,91 +489,12 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ;
 ; PRED-LABEL: define void @ivs_trunc_and_ext(
 ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; PRED-NEXT:  [[ENTRY:.*:]]
+; PRED-NEXT:  [[ENTRY:.*]]:
 ; PRED-NEXT:    [[ADD:%.*]] = add i32 [[X]], 1
-; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
-; PRED-NEXT:    br label %[[VECTOR_SCEVCHECK:.*]]
-; PRED:       [[VECTOR_SCEVCHECK]]:
-; PRED-NEXT:    [[TMP1:%.*]] = sub i32 -1, [[X]]
-; PRED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0
-; PRED-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]]
-; PRED-NEXT:    [[TMP4:%.*]] = trunc i64 [[N]] to i32
-; PRED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]])
-; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; PRED-NEXT:    [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0
-; PRED-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false
-; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295
-; PRED-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0
-; PRED-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]]
-; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]]
-; PRED-NEXT:    br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; PRED:       [[VECTOR_PH]]:
-; PRED-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4
-; PRED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0
-; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
-; PRED:       [[VECTOR_BODY]]:
-; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; PRED-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
-; PRED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT:    [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT:    [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64>
-; PRED-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
-; PRED-NEXT:    br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; PRED:       [[PRED_STORE_IF]]:
-; PRED-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
-; PRED-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]]
-; PRED-NEXT:    [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0
-; PRED-NEXT:    store i32 [[TMP21]], ptr [[TMP20]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; PRED:       [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT:    br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
-; PRED:       [[PRED_STORE_IF1]]:
-; PRED-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
-; PRED-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
-; PRED-NEXT:    [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
-; PRED-NEXT:    store i32 [[TMP25]], ptr [[TMP24]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
-; PRED:       [[PRED_STORE_CONTINUE2]]:
-; PRED-NEXT:    [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT:    br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
-; PRED:       [[PRED_STORE_IF3]]:
-; PRED-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
-; PRED-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
-; PRED-NEXT:    [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT:    store i32 [[TMP29]], ptr [[TMP28]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
-; PRED:       [[PRED_STORE_CONTINUE4]]:
-; PRED-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT:    br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]]
-; PRED:       [[PRED_STORE_IF5]]:
-; PRED-NEXT:    [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
-; PRED-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
-; PRED-NEXT:    [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT:    store i32 [[TMP33]], ptr [[TMP32]], align 4
-; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
-; PRED:       [[PRED_STORE_CONTINUE6]]:
-; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
-; PRED-NEXT:    [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
-; PRED-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP34]], true
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; PRED-NEXT:    br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; PRED:       [[MIDDLE_BLOCK]]:
-; PRED-NEXT:    br label %[[EXIT:.*]]
-; PRED:       [[SCALAR_PH]]:
 ; PRED-NEXT:    br label %[[LOOP:.*]]
 ; PRED:       [[LOOP]]:
-; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
-; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
+; PRED-NEXT:    [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
+; PRED-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
 ; PRED-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32
 ; PRED-NEXT:    [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]]
 ; PRED-NEXT:    [[IV_2_NEXT]] = add i32 [[IV_2]], 1
@@ -720,7 +503,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
 ; PRED-NEXT:    store i32 [[IV_2]], ptr [[GEP]], align 4
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_1]], 1
 ; PRED-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; PRED-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; PRED:       [[EXIT]]:
 ; PRED-NEXT:    ret void
 ;
@@ -842,7 +625,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED:       [[PRED_STORE_CONTINUE5]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
 ; PRED-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; PRED-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; PRED-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
 ; PRED:       [[SCALAR_PH]]:
@@ -855,7 +638,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
 ; PRED-NEXT:    [[IV_1_NEXT]] = add i32 [[IV_1]], 1
 ; PRED-NEXT:    [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64
 ; PRED-NEXT:    [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]]
-; PRED-NEXT:    br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; PRED-NEXT:    br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
 ; PRED:       [[EXIT]]:
 ; PRED-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index 6ea075f76aed4..83be0708774f1 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -181,178 +181,23 @@ for.cond.cleanup:
 define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) {
 ; DEFAULT-LABEL: define void @tail_predicate_without_optsize(
 ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) {
-; DEFAULT-NEXT:  [[ENTRY:.*:]]
-; DEFAULT-NEXT:    br label %[[VECTOR_PH:.*]]
-; DEFAULT:       [[VECTOR_PH]]:
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; DEFAULT-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
-; DEFAULT-NEXT:    br label %[[VECTOR_BODY:.*]]
-; DEFAULT:       [[VECTOR_BODY]]:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT:    [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
-; DEFAULT-NEXT:    [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
-; DEFAULT-NEXT:    [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
-; DEFAULT-NEXT:    [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
-; DEFAULT-NEXT:    [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2)
-; DEFAULT-NEXT:    [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]]
-; DEFAULT-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; DEFAULT:       [[PRED_STORE_IF]]:
-; DEFAULT-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]]
-; DEFAULT-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0
-; DEFAULT-NEXT:    store i8 [[TMP11]], ptr [[TMP10]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; DEFAULT:       [[PRED_STORE_CONTINUE]]:
-; DEFAULT-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
-; DEFAULT:       [[PRED_STORE_IF6]]:
-; DEFAULT-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 1
-; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
-; DEFAULT-NEXT:    store i8 [[TMP15]], ptr [[TMP14]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
-; DEFAULT:       [[PRED_STORE_CONTINUE7]]:
-; DEFAULT-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
-; DEFAULT:       [[PRED_STORE_IF8]]:
-; DEFAULT-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
-; DEFAULT-NEXT:    [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
-; DEFAULT-NEXT:    store i8 [[TMP19]], ptr [[TMP18]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE9]]
-; DEFAULT:       [[PRED_STORE_CONTINUE9]]:
-; DEFAULT-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]]
-; DEFAULT:       [[PRED_STORE_IF10]]:
-; DEFAULT-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 3
-; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
-; DEFAULT-NEXT:    [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
-; DEFAULT-NEXT:    store i8 [[TMP23]], ptr [[TMP22]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE11]]
-; DEFAULT:       [[PRED_STORE_CONTINUE11]]:
-; DEFAULT-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
-; DEFAULT-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]]
-; DEFAULT:       [[PRED_STORE_IF12]]:
-; DEFAULT-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 4
-; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
-; DEFAULT-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
-; DEFAULT-NEXT:    store i8 [[TMP27]], ptr [[TMP26]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE13]]
-; DEFAULT:       [[PRED_STORE_CONTINUE13]]:
-; DEFAULT-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
-; DEFAULT-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]]
-; DEFAULT:       [[PRED_STORE_IF14]]:
-; DEFAULT-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 5
-; DEFAULT-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
-; DEFAULT-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
-; DEFAULT-NEXT:    store i8 [[TMP31]], ptr [[TMP30]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE15]]
-; DEFAULT:       [[PRED_STORE_CONTINUE15]]:
-; DEFAULT-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
-; DEFAULT-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]]
-; DEFAULT:       [[PRED_STORE_IF16]]:
-; DEFAULT-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX]], 6
-; DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
-; DEFAULT-NEXT:    [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
-; DEFAULT-NEXT:    store i8 [[TMP35]], ptr [[TMP34]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE17]]
-; DEFAULT:       [[PRED_STORE_CONTINUE17]]:
-; DEFAULT-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
-; DEFAULT-NEXT:    br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]]
-; DEFAULT:       [[PRED_STORE_IF18]]:
-; DEFAULT-NEXT:    [[TMP37:%.*]] = add i64 [[INDEX]], 7
-; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
-; DEFAULT-NEXT:    [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
-; DEFAULT-NEXT:    store i8 [[TMP39]], ptr [[TMP38]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE19]]
-; DEFAULT:       [[PRED_STORE_CONTINUE19]]:
-; DEFAULT-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
-; DEFAULT-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]]
-; DEFAULT:       [[PRED_STORE_IF20]]:
-; DEFAULT-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
-; DEFAULT-NEXT:    [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
-; DEFAULT-NEXT:    store i8 [[TMP43]], ptr [[TMP42]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE21]]
-; DEFAULT:       [[PRED_STORE_CONTINUE21]]:
-; DEFAULT-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
-; DEFAULT-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]]
-; DEFAULT:       [[PRED_STORE_IF22]]:
-; DEFAULT-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX]], 9
-; DEFAULT-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
-; DEFAULT-NEXT:    [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
-; DEFAULT-NEXT:    store i8 [[TMP47]], ptr [[TMP46]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE23]]
-; DEFAULT:       [[PRED_STORE_CONTINUE23]]:
-; DEFAULT-NEXT:    [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
-; DEFAULT-NEXT:    br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]]
-; DEFAULT:       [[PRED_STORE_IF24]]:
-; DEFAULT-NEXT:    [[TMP49:%.*]] = add i64 [[INDEX]], 10
-; DEFAULT-NEXT:    [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]]
-; DEFAULT-NEXT:    [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10
-; DEFAULT-NEXT:    store i8 [[TMP51]], ptr [[TMP50]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE25]]
-; DEFAULT:       [[PRED_STORE_CONTINUE25]]:
-; DEFAULT-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
-; DEFAULT-NEXT:    br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]]
-; DEFAULT:       [[PRED_STORE_IF26]]:
-; DEFAULT-NEXT:    [[TMP53:%.*]] = add i64 [[INDEX]], 11
-; DEFAULT-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]]
-; DEFAULT-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11
-; DEFAULT-NEXT:    store i8 [[TMP55]], ptr [[TMP54]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE27]]
-; DEFAULT:       [[PRED_STORE_CONTINUE27]]:
-; DEFAULT-NEXT:    [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
-; DEFAULT-NEXT:    br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]]
-; DEFAULT:       [[PRED_STORE_IF28]]:
-; DEFAULT-NEXT:    [[TMP57:%.*]] = add i64 [[INDEX]], 12
-; DEFAULT-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]]
-; DEFAULT-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12
-; DEFAULT-NEXT:    store i8 [[TMP59]], ptr [[TMP58]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE29]]
-; DEFAULT:       [[PRED_STORE_CONTINUE29]]:
-; DEFAULT-NEXT:    [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
-; DEFAULT-NEXT:    br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]]
-; DEFAULT:       [[PRED_STORE_IF30]]:
-; DEFAULT-NEXT:    [[TMP61:%.*]] = add i64 [[INDEX]], 13
-; DEFAULT-NEXT:    [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]]
-; DEFAULT-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13
-; DEFAULT-NEXT:    store i8 [[TMP63]], ptr [[TMP62]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE31]]
-; DEFAULT:       [[PRED_STORE_CONTINUE31]]:
-; DEFAULT-NEXT:    [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
-; DEFAULT-NEXT:    br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
-; DEFAULT:       [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT:    [[TMP65:%.*]] = add i64 [[INDEX]], 14
-; DEFAULT-NEXT:    [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]]
-; DEFAULT-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14
-; DEFAULT-NEXT:    store i8 [[TMP67]], ptr [[TMP66]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE33]]
-; DEFAULT:       [[PRED_STORE_CONTINUE33]]:
-; DEFAULT-NEXT:    [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
-; DEFAULT-NEXT:    br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT:       [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT:    [[TMP69:%.*]] = add i64 [[INDEX]], 15
+; DEFAULT-NEXT:  [[ENTRY:.*]]:
+; DEFAULT-NEXT:    br label %[[FOR_BODY:.*]]
+; DEFAULT:       [[FOR_BODY]]:
+; DEFAULT-NEXT:    [[TMP69:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = trunc nuw nsw i64 [[TMP69]] to i8
+; DEFAULT-NEXT:    [[MUL:%.*]] = mul i8 [[A]], [[TMP0]]
+; DEFAULT-NEXT:    [[SHR:%.*]] = lshr i8 [[TMP0]], 1
+; DEFAULT-NEXT:    [[MUL5:%.*]] = mul i8 [[SHR]], [[B]]
+; DEFAULT-NEXT:    [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]]
+; DEFAULT-NEXT:    [[SHR7:%.*]] = lshr i8 [[TMP0]], 2
+; DEFAULT-NEXT:    [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]]
+; DEFAULT-NEXT:    [[TMP71:%.*]] = add i8 [[ADD]], [[MUL9]]
 ; DEFAULT-NEXT:    [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]]
-; DEFAULT-NEXT:    [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15
 ; DEFAULT-NEXT:    store i8 [[TMP71]], ptr [[TMP70]], align 1
-; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE35]]
-; DEFAULT:       [[PRED_STORE_CONTINUE35]]:
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
-; DEFAULT-NEXT:    [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; DEFAULT:       [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[TMP69]], 1
+; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15
+; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -449,7 +294,7 @@ define void @dont_vectorize_with_minsize() {
 ; DEFAULT-NEXT:    store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
@@ -555,7 +400,7 @@ define void @vectorization_forced() {
 ; DEFAULT-NEXT:    store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4
 ; DEFAULT-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
index a286df9bc2fc7..c2c04ce6f5ff5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
@@ -85,13 +85,13 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test2'
@@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index cc84fabd00ecc..002d811d46992 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -435,67 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3
 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9
-; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
-; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]]
-; CHECK-NEXT:    store i32 [[TMP6]], ptr [[TMP4]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK:       pred.store.if1:
-; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]]
-; CHECK-NEXT:    store i32 [[TMP11]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; CHECK:       pred.store.continue2:
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK:       pred.store.if3:
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK:       loop:
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ 4, [[ENTRY]] ], [ [[TMP18]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[TMP18]], 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]]
-; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP14]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; CHECK:       pred.store.continue4:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK:       pred.store.if5:
-; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]]
-; CHECK-NEXT:    store i32 [[TMP21]], ptr [[TMP19]], align 4
-; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; CHECK:       pred.store.continue6:
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP18]]
+; CHECK-NEXT:    store i32 [[TMP16]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index fe230fa6c9090..b72cbd333cb79 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -49,6 +49,8 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT: loop.0:
 ; CHECK-NEXT:   WIDEN-CAST ir<%conv> = sext vp<[[PRED1]]> to i32
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv>
+; CHECK-NEXT:   WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
+; CHECK-NEXT:   WIDEN ir<%add> = add ir<%conv>, ir<%rem>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
@@ -57,9 +59,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]>
-; CHECK-NEXT:     REPLICATE ir<%add> = add ir<%conv>, ir<%rem>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep.dst>
 ; CHECK-NEXT:   Successor(s): pred.store.continue
 ; CHECK-EMPTY:
@@ -293,27 +293,44 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr
 ; CHECK-NEXT: loop.0:
 ; CHECK-NEXT:   WIDEN-CAST ir<%conv> = sext vp<[[PRED]]> to i32
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv>
-; CHECK-NEXT: Successor(s): pred.store
+; CHECK-NEXT:   WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
+; CHECK-NEXT: Successor(s): pred.load
 ; CHECK-EMPTY:
-; CHECK:      <xVFxUF> pred.store: {
-; CHECK-NEXT:   pred.store.entry:
+; CHECK:      <xVFxUF> pred.load: {
+; CHECK-NEXT:   pred.load.entry:
 ; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK]]>
-; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
+; CHECK-NEXT:   Successor(s): pred.load.if, pred.load.continue
 ; CHECK-EMPTY:
-; CHECK:        pred.store.if:
-; CHECK-NEXT:     REPLICATE ir<%lv.2> = load ir<%gep>
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
-; CHECK-NEXT:     REPLICATE ir<%conv.lv.2> = sext ir<%lv.2>
-; CHECK-NEXT:     REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem>
-; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]>
-; CHECK-NEXT:     REPLICATE ir<%add> = add ir<%add.1>, ir<%conv.lv.2>
-; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep.dst>
-; CHECK-NEXT:   Successor(s): pred.store.continue
+; CHECK:        pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%lv.2> = load ir<%gep> (S->V)
+; CHECK-NEXT:   Successor(s): pred.load.continue
 ; CHECK-EMPTY:
-; CHECK:        pred.store.continue:
+; CHECK:        pred.load.continue:
+; CHECK-NEXT:     PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.2>
 ; CHECK-NEXT:   No successors
 ; CHECK-NEXT: }
-; CHECK-NEXT:   Successor(s): loop.2
+; CHECK-NEXT:   Successor(s): loop.1
+; CHECK-EMPTY:
+; CHECK-NEXT:  loop.1:
+; CHECK-NEXT:    WIDEN ir<%add.1> = add ir<%conv>, ir<%rem>
+; CHECK-NEXT:    WIDEN-CAST ir<%conv.lv.2> = sext vp<%9> to i32
+; CHECK-NEXT:    WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2>
+; CHECK-NEXT:  Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT:  <xVFxUF> pred.store: {
+; CHECK-NEXT:    pred.store.entry:
+; CHECK-NEXT:      BRANCH-ON-MASK vp<[[MASK]]>
+; CHECK-NEXT:    Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.store.if:
+; CHECK-NEXT:      REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]>
+; CHECK-NEXT:      REPLICATE store ir<%add>, ir<%gep.dst>
+; CHECK-NEXT:    Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT:    pred.store.continue:
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): loop.2
 ; CHECK-EMPTY:
 ; CHECK:      loop.2:
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
@@ -377,6 +394,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-NEXT:   ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT:   EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
 ; CHECK-NEXT:   EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
+; CHECK-NEXT:   WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
@@ -386,7 +404,6 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
 ; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:     REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:     REPLICATE ir<%rem.div> = sdiv ir<20>, ir<%rem>
 ; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT:     REPLICATE store ir<%rem.div>, ir<%gep>
@@ -457,6 +474,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT:     EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]>
 ; CHECK-NEXT:     CLONE ir<[[L]]> = load ir<%src>
 ; CHECK-NEXT:     EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn>, ir<[[L]]>
+; CHECK-NEXT:     WIDEN ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:  Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   <xVFxUF> pred.store: {
@@ -467,7 +485,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias
 ; CHECK-NEXT:     pred.store.if:
 ; CHECK-NEXT:       vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>, vp<[[VF]]>
 ; CHECK-NEXT:       REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]>
-; CHECK-NEXT:       REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x>
 ; CHECK-NEXT:       REPLICATE store ir<%val>, ir<%gep.dst>
 ; CHECK-NEXT:     Successor(s): pred.store.continue
 ; CHECK-EMPTY:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 2dd6a04ee7d4a..3161a0d5e6f5e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 
-; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -debug -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -debug -disable-output %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

From e2a2c03eefc0f0d2844065949591acb5bafc69b3 Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev <vdzhidzhoev@accesssoftek.com>
Date: Mon, 10 Nov 2025 13:13:49 +0100
Subject: [PATCH 02/28] [DebugInfo] Add Verifier check for incorrectly-scoped
 retainedNodes (#166855)

These checks ensure that retained nodes of a DISubprogram belong to the
subprogram.

Tests with incorrect IR are fixed. We should not have variables of one subprogram present in retained nodes of other subprograms.

Also, interface for accessing DISubprogram's retained nodes is slightly
refactored. `DISubprogram::visitRetainedNodes` and
`DISubprogram::forEachRetainedNode` are added to avoid repeating checks
like
```
if (const auto *LV = dyn_cast<DILocalVariable>(N))
  ...
else if (const auto *L = dyn_cast<DILabel>(N))
  ...
else if (const auto *IE = dyn_cast<DIImportedEntity>(N))
  ...
```
---
 llvm/include/llvm/IR/DebugInfo.h              |  4 +--
 llvm/include/llvm/IR/DebugInfoMetadata.h      | 33 +++++++++++++++++++
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    | 12 +------
 llvm/lib/IR/DebugInfo.cpp                     | 14 ++++----
 llvm/lib/IR/DebugInfoMetadata.cpp             | 13 ++++++++
 llvm/lib/IR/Verifier.cpp                      | 20 +++++++++--
 .../X86/StackColoring-dbg-invariance.mir      |  2 +-
 .../DebugInfo/MIR/X86/clobbered-fragments.mir |  3 +-
 llvm/test/DebugInfo/MIR/X86/machine-cse.mir   |  5 +--
 .../MIR/X86/remove-redundant-dbg-vals.mir     | 10 +++---
 .../DebugInfo/X86/instr-ref-selectiondag.ll   | 12 ++++---
 .../X86/live-debug-values-constprop.mir       |  9 +++--
 .../X86/live-debug-values-remove-range.ll     |  2 +-
 .../X86/live-debug-vars-intervals.mir         |  2 +-
 llvm/test/Linker/thinlto_funcimport_debug.ll  |  4 +--
 .../CodeExtractor/PartialInlineDebug.ll       |  6 ++--
 .../HotColdSplit/split-out-dbg-label.ll       |  2 +-
 .../HotColdSplit/transfer-debug-info.ll       |  2 +-
 .../Transforms/InstCombine/debuginfo-dce.ll   |  8 ++---
 .../LCSSA/rewrite-existing-dbg-values.ll      |  3 +-
 .../test/Transforms/LoopVectorize/debugloc.ll |  4 +--
 .../LowerMatrixIntrinsics/remarks-inlining.ll | 10 +++---
 .../LowerMatrixIntrinsics/remarks.ll          | 16 ++++-----
 .../Util/annotation-remarks-dbg-info.ll       |  2 +-
 24 files changed, 129 insertions(+), 69 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index 33e6df0ecb873..862293c9666a7 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -108,7 +108,7 @@ class DebugInfoFinder {
   LLVM_ABI void processInstruction(const Module &M, const Instruction &I);
 
   /// Process a DILocalVariable.
-  LLVM_ABI void processVariable(DILocalVariable *DVI);
+  LLVM_ABI void processVariable(const DILocalVariable *DVI);
   /// Process debug info location.
   LLVM_ABI void processLocation(const Module &M, const DILocation *Loc);
   /// Process a DbgRecord.
@@ -124,7 +124,7 @@ class DebugInfoFinder {
   void processCompileUnit(DICompileUnit *CU);
   void processScope(DIScope *Scope);
   void processType(DIType *DT);
-  void processImportedEntity(DIImportedEntity *Import);
+  void processImportedEntity(const DIImportedEntity *Import);
   bool addCompileUnit(DICompileUnit *CU);
   bool addGlobalVariable(DIGlobalVariableExpression *DIG);
   bool addScope(DIScope *Scope);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 7ade6b8e13308..6918b21d5b363 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -2554,6 +2554,39 @@ class DISubprogram : public DILocalScope {
     replaceOperandWith(7, N.get());
   }
 
+  /// For the given retained node of DISubprogram, applies one of the
+  /// given functions depending on the type of the node.
+  template <typename T, typename FuncLVT, typename FuncLabelT,
+            typename FuncImportedEntityT, typename FuncUnknownT>
+  static T
+  visitRetainedNode(const Metadata *N, FuncLVT &&FuncLV, FuncLabelT &&FuncLabel,
+                    FuncImportedEntityT &&FuncIE, FuncUnknownT &&FuncUnknown) {
+    if (const auto *LV = dyn_cast<DILocalVariable>(N))
+      return FuncLV(LV);
+    if (const auto *L = dyn_cast<DILabel>(N))
+      return FuncLabel(L);
+    if (const auto *IE = dyn_cast<DIImportedEntity>(N))
+      return FuncIE(IE);
+    return FuncUnknown(N);
+  }
+
+  /// Returns the scope of subprogram's retainedNodes.
+  static const DILocalScope *getRetainedNodeScope(const MDNode *N);
+  // For use in Verifier.
+  static const DIScope *getRawRetainedNodeScope(const MDNode *N);
+
+  /// For each retained node, applies one of the given functions depending
+  /// on the type of a node.
+  template <typename FuncLVT, typename FuncLabelT, typename FuncImportedEntityT>
+  void forEachRetainedNode(FuncLVT &&FuncLV, FuncLabelT &&FuncLabel,
+                           FuncImportedEntityT &&FuncIE) const {
+    for (MDNode *N : getRetainedNodes())
+      visitRetainedNode<void>(N, FuncLV, FuncLabel, FuncIE,
+                              [](const Metadata *N) {
+                                llvm_unreachable("Unexpected retained node!");
+                              });
+  }
+
   /// Check if this subprogram describes the given function.
   ///
   /// FIXME: Should this be looking through bitcasts?
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 567acf75d1b8d..fde48a30a8203 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1544,18 +1544,8 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
 }
 
 static const DILocalScope *getRetainedNodeScope(const MDNode *N) {
-  const DIScope *S;
-  if (const auto *LV = dyn_cast<DILocalVariable>(N))
-    S = LV->getScope();
-  else if (const auto *L = dyn_cast<DILabel>(N))
-    S = L->getScope();
-  else if (const auto *IE = dyn_cast<DIImportedEntity>(N))
-    S = IE->getScope();
-  else
-    llvm_unreachable("Unexpected retained node!");
-
   // Ensure the scope is not a DILexicalBlockFile.
-  return cast<DILocalScope>(S)->getNonLexicalBlockFileScope();
+  return DISubprogram::getRetainedNodeScope(N)->getNonLexicalBlockFileScope();
 }
 
 // Collect variable information from side table maintained by MF.
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 58836068a4929..4c5fb742aa3eb 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -247,7 +247,7 @@ void DebugInfoFinder::processType(DIType *DT) {
   }
 }
 
-void DebugInfoFinder::processImportedEntity(DIImportedEntity *Import) {
+void DebugInfoFinder::processImportedEntity(const DIImportedEntity *Import) {
   auto *Entity = Import->getEntity();
   if (auto *T = dyn_cast<DIType>(Entity))
     processType(T);
@@ -307,15 +307,13 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
     }
   }
 
-  for (auto *N : SP->getRetainedNodes()) {
-    if (auto *Var = dyn_cast_or_null<DILocalVariable>(N))
-      processVariable(Var);
-    else if (auto *Import = dyn_cast_or_null<DIImportedEntity>(N))
-      processImportedEntity(Import);
-  }
+  SP->forEachRetainedNode(
+      [this](const DILocalVariable *LV) { processVariable(LV); },
+      [](const DILabel *L) {},
+      [this](const DIImportedEntity *IE) { processImportedEntity(IE); });
 }
 
-void DebugInfoFinder::processVariable(DILocalVariable *DV) {
+void DebugInfoFinder::processVariable(const DILocalVariable *DV) {
   if (!NodesSeen.insert(DV).second)
     return;
   processScope(DV->getScope());
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index a98e925b6f1d8..1a6a25e161803 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -1441,6 +1441,19 @@ bool DISubprogram::describes(const Function *F) const {
   assert(F && "Invalid function");
   return F->getSubprogram() == this;
 }
+
+const DIScope *DISubprogram::getRawRetainedNodeScope(const MDNode *N) {
+  return visitRetainedNode<DIScope *>(
+      N, [](const DILocalVariable *LV) { return LV->getScope(); },
+      [](const DILabel *L) { return L->getScope(); },
+      [](const DIImportedEntity *IE) { return IE->getScope(); },
+      [](const Metadata *N) { return nullptr; });
+}
+
+const DILocalScope *DISubprogram::getRetainedNodeScope(const MDNode *N) {
+  return cast<DILocalScope>(getRawRetainedNodeScope(N));
+}
+
 DILexicalBlockBase::DILexicalBlockBase(LLVMContext &C, unsigned ID,
                                        StorageType Storage,
                                        ArrayRef<Metadata *> Ops)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 59eb870798235..a4f647409094c 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -1559,11 +1559,27 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
     auto *Node = dyn_cast<MDTuple>(RawNode);
     CheckDI(Node, "invalid retained nodes list", &N, RawNode);
     for (Metadata *Op : Node->operands()) {
-      CheckDI(Op && (isa<DILocalVariable>(Op) || isa<DILabel>(Op) ||
-                     isa<DIImportedEntity>(Op)),
+      CheckDI(Op, "nullptr in retained nodes", &N, Node);
+
+      auto True = [](const Metadata *) { return true; };
+      auto False = [](const Metadata *) { return false; };
+      bool IsTypeCorrect =
+          DISubprogram::visitRetainedNode<bool>(Op, True, True, True, False);
+      CheckDI(IsTypeCorrect,
               "invalid retained nodes, expected DILocalVariable, DILabel or "
               "DIImportedEntity",
               &N, Node, Op);
+
+      auto *RetainedNode = cast<DINode>(Op);
+      auto *RetainedNodeScope = dyn_cast_or_null<DILocalScope>(
+          DISubprogram::getRawRetainedNodeScope(RetainedNode));
+      CheckDI(RetainedNodeScope,
+              "invalid retained nodes, retained node is not local", &N, Node,
+              RetainedNode);
+      CheckDI(
+          RetainedNodeScope->getSubprogram() == &N,
+          "invalid retained nodes, retained node does not belong to subprogram",
+          &N, Node, RetainedNode, RetainedNodeScope);
     }
   }
   CheckDI(!hasConflictingReferenceFlags(N.getFlags()),
diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
index 348a2901ff6a4..24453066f2583 100644
--- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
@@ -55,7 +55,7 @@
   !9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10)
   !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
   !11 = !DILocation(line: 4, column: 1, scope: !5)
-  !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+  !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7)
 
 ...
 ---
diff --git a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir
index a334e99b9cade..ea01835cae1e5 100644
--- a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir
+++ b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir
@@ -85,10 +85,11 @@
   !15 = !DISubrange(count: 3)
   !16 = !DILocation(line: 8, scope: !8)
   !17 = !DILocation(line: 9, scope: !8)
-  !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !11)
+  !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !22)
   !19 = !DILocalVariable(name: "local", scope: !18, file: !2, line: 8, type: !13)
   !20 = !DILocation(line: 15, scope: !18)
   !21 = !DILocation(line: 16, scope: !18)
+  !22 = !{!19}
 
 ...
 ---
diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir
index c38c0a1a79f75..63dc44fb705fe 100644
--- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir
+++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir
@@ -73,13 +73,14 @@
   !0 = !{i32 2, !"Debug Info Version", i32 3}
   !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug)
   !2 = !DIFile(filename: "bees.cpp", directory: "")
-  !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8)
-  !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8)
+  !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9)
+  !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9)
   !4 = !DILocalVariable(name: "bees", scope: !3, type: !5)
   !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64)
   !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
   !7 = !DILocation(line: 0, scope: !3)
   !8 = !{!4}
+  !9 = !{}
 
 
   ; CHECK: ![[METAVAR:[0-9]+]] = !DILocalVariable(name: "bees",
diff --git a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir
index 06ce18d8edaa7..28fc044e606b5 100644
--- a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir
+++ b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir
@@ -139,15 +139,15 @@
   !23 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !24, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2)
   !24 = !DISubroutineType(types: !25)
   !25 = !{null, !11}
-  !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+  !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
   !27 = !DILocation(line: 0, scope: !26)
-  !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+  !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
   !29 = !DILocation(line: 0, scope: !28)
-  !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+  !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
   !31 = !DILocation(line: 0, scope: !30)
-  !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+  !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
   !33 = !DILocation(line: 0, scope: !32)
-  !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12)
+  !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
   !35 = !DILocation(line: 0, scope: !34)
 
 ...
diff --git a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll
index dbbef2b39587d..594607c6e95d8 100644
--- a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll
+++ b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll
@@ -281,15 +281,19 @@ lala:
 !11 = !{!13}
 !13 = !DILocalVariable(name: "baz", scope: !7, file: !1, line: 6, type: !10)
 !14 = !DILocation(line: 1, scope: !7)
-!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !23)
 !21 = !DILocalVariable(name: "xyzzy", scope: !20, file: !1, line: 6, type: !10)
 !22 = !DILocation(line: 1, scope: !20)
-!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!23 = !{!21}
+!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !33)
 !31 = !DILocalVariable(name: "xyzzy", scope: !30, file: !1, line: 6, type: !10)
 !32 = !DILocation(line: 1, scope: !30)
-!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!33 = !{!31}
+!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !46)
 !41 = !DILocalVariable(name: "socks", scope: !40, file: !1, line: 6, type: !10)
 !42 = !DILocation(line: 1, scope: !40)
-!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !47)
 !44 = !DILocation(line: 0, scope: !43, inlinedAt: !42)
 !45 = !DILocalVariable(name: "knees", scope: !43, file: !1, line: 6, type: !10)
+!46 = !{!41}
+!47 = !{!45}
diff --git a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir
index 8a0537658c9c0..2900f0bdcf864 100644
--- a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir
+++ b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir
@@ -82,15 +82,18 @@
   !14 = !DISubroutineType(types: !15)
   !15 = !{!16}
   !16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-  !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true)
+  !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !43, type: !14, isDefinition: true)
   !41 = !DILocalVariable(name: "towel", scope: !40, file: !2, line: 1, type: !16)
   !42 = !DILocation(line: 40, scope: !40)
-  !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true)
+  !43 = !{!41}
+  !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !83, type: !14, isDefinition: true)
   !81 = !DILocalVariable(name: "socks", scope: !80, file: !2, line: 1, type: !16)
   !82 = !DILocation(line: 40, scope: !80)
-  !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true)
+  !83 = !{!81}
+  !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !123, type: !14, isDefinition: true)
   !121 = !DILocalVariable(name: "shoes", scope: !120, file: !2, line: 1, type: !16)
   !122 = !DILocation(line: 40, scope: !120)
+  !123 = !{!121}
 
 ...
 ---
diff --git a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll
index e656c6237c068..145b5045687cf 100644
--- a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll
+++ b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll
@@ -108,6 +108,6 @@ exit:
 !106 = !DILocation(line: 1, scope: !104)
 !113 = !{!103}
 !203 = !DILocalVariable(name: "teacake", scope: !204, file: !2, line: 1, type: !16)
-!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !113, type: !14, isDefinition: true)
+!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !213, type: !14, isDefinition: true)
 !206 = !DILocation(line: 1, scope: !204)
 !213 = !{!203}
diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir
index 3beaf8996e4f0..ab57a9612702f 100644
--- a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir
+++ b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir
@@ -91,7 +91,7 @@
   !10 = !{!11}
   !11 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !1, line: 3, type: !9)
   !12 = !DILocation(line: 3, column: 12, scope: !6)
-  !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10)
+  !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !{!14})
   !14 = !DILocalVariable(name: "x", arg: 1, scope: !13, file: !1, line: 21, type: !9)
   !15 = !DILocation(line: 23, column: 12, scope: !13)
 
diff --git a/llvm/test/Linker/thinlto_funcimport_debug.ll b/llvm/test/Linker/thinlto_funcimport_debug.ll
index 294b3a773ef51..4454a56c40ef7 100644
--- a/llvm/test/Linker/thinlto_funcimport_debug.ll
+++ b/llvm/test/Linker/thinlto_funcimport_debug.ll
@@ -80,8 +80,8 @@ attributes #1 = { nounwind readnone }
 !26 = !DILocation(line: 9, column: 3, scope: !4)
 !27 = distinct !DISubprogram(name: "func3", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !28)
 !28 = !{!29}
-!29 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7)
+!29 = !DILocalVariable(name: "n", arg: 1, scope: !27, file: !1, line: 8, type: !33)
 !30 = distinct !DISubprogram(name: "func4", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !31)
 !31 = !{!32}
 !32 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7)
-
+!33 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !30, file: !1, line: 13, baseType: !7)
diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll
index eb2fb4f4774d8..ab01bbf20de71 100644
--- a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll
+++ b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll
@@ -96,11 +96,11 @@ entry:
 !13 = !DILocalVariable(name: "v", arg: 1, scope: !8, file: !1, line: 3, type: !11)
 !14 = !DILocation(line: 5, column: 10, scope: !8)
 !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 9, column: 7)
-!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !17 = !DILocation(line: 10, column: 7, scope: !15)
-!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !19 = distinct !DILexicalBlock(scope: !18, file: !1, line: 100, column: 1)
 !20 = !DILocation(line: 110, column: 17, scope: !19)
-!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !22 = !DILocation(line: 110, column: 17, scope: !21)
 !23 = !DILocation(line: 15, column: 7, scope: !15)
diff --git a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll
index da6c19d604c7c..76406ddea6b9f 100644
--- a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll
+++ b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll
@@ -66,7 +66,7 @@ define void @inline_me() !dbg !13 {
 !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
 !11 = !DILocation(line: 1, column: 1, scope: !6)
 !12 = !DILabel(scope: !6, name: "bye", file: !1, line: 28)
-!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2)
 !14 = !DILabel(scope: !13, name: "label_in_@inline_me", file: !1, line: 29)
 !15 = !DILocation(line: 2, column: 2, scope: !13, inlinedAt: !11)
 !16 = !DILabel(scope: !17, name: "scoped_label_in_foo", file: !1, line: 30)
diff --git a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll
index 3f69f0c200dad..f9dd9eaf01422 100644
--- a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll
+++ b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll
@@ -106,7 +106,7 @@ define void @inline_me() !dbg !12{
 !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
 !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
 !11 = !DILocation(line: 1, column: 1, scope: !6)
-!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2)
 !13 = !DILocation(line: 2, column: 2, scope: !12, inlinedAt: !14)
 !14 = !DILocation(line: 3, column: 3, scope: !15)
 !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 4)
diff --git a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
index c1d7c30e936f2..ec90779d0acce 100644
--- a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
+++ b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll
@@ -125,15 +125,15 @@ attributes #1 = { nounwind readnone }
 !19 = !DILocation(line: 6, column: 17, scope: !14)
 !20 = !DIExpression(DW_OP_plus_uconst, 0)
 !21 = !DILocation(line: 11, column: 1, scope: !14)
-!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !23 = !DILocation(line: 6, column: 17, scope: !22)
 !24 = !DILocalVariable(name: "entry", scope: !22, file: !1, line: 6, type: !4)
-!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !26 = !DILocation(line: 6, column: 17, scope: !25)
 !27 = !DILocalVariable(name: "entry", scope: !25, file: !1, line: 6, type: !4)
-!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !29 = !DILocation(line: 6, column: 17, scope: !28)
 !30 = !DILocalVariable(name: "entry", scope: !28, file: !1, line: 6, type: !4)
-!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !32 = !DILocation(line: 6, column: 17, scope: !31)
 !33 = !DILocalVariable(name: "entry", scope: !31, file: !1, line: 6, type: !4)
diff --git a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
index 437e56665d53b..fa8357505e7e9 100644
--- a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
+++ b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
@@ -131,7 +131,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0)
 !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 !12 = !DILocation(line: 0, scope: !10)
-!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17)
 !14 = !DILocation(line: 0, scope: !15)
 !15 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 0)
 !16 = !DILocalVariable(name: "sum2", scope: !15, file: !1, line: 11, type: !11)
+!17 = !{!16}
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll
index 40cd6b63ca8f6..03e0853d29075 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll
@@ -253,10 +253,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 !32 = distinct !DILexicalBlock(scope: !31, file: !5, line: 137, column: 2)
 !33 = !DILocation(line: 210, column: 44, scope: !32)
 !34 = !DILocation(line: 320, column: 44, scope: !32)
-!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12)
+!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2)
 !36 = distinct !DILexicalBlock(scope: !35, file: !5, line: 137, column: 2)
 !37 = !DILocation(line: 430, column: 44, scope: !36)
 !38 = !DILocation(line: 540, column: 44, scope: !36)
-!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12)
+!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2)
 !40 = distinct !DILexicalBlock(scope: !39, file: !5, line: 137, column: 2)
 !41 = !DILocation(line: 650, column: 44, scope: !40)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
index aaabd18958fae..618ec86ebd35d 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll
@@ -118,18 +118,18 @@ declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)
 !4 = !{i32 2, !"Debug Info Version", i32 3}
 !5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
 !17 = !DIFile(filename: "toplevel.c", directory: "/test")
-!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !18 = !DIFile(filename: "assign.h", directory: "/test")
-!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 
 !20 = !DIFile(filename: "add.h", directory: "/test")
-!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 
 !22 = !DIFile(filename: "store.h", directory: "/test")
-!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 
 !24 = !DIFile(filename: "transpose.h", directory: "/test")
-!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 
 
 !6 = !DISubroutineType(types: !7)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
index 628ff08b81679..ff41c57055bff 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll
@@ -163,26 +163,26 @@ declare void @llvm.matrix.column.major.store(<9 x double>, ptr, i64, i1, i32, i3
 !19 = !DILocation(line: 10, column: 20, scope: !5)
 !20 = !DILocation(line: 10, column: 10, scope: !5)
 
-!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !22 = !DILocation(line: 30, column: 20, scope: !21)
 
-!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !24 = !DILocation(line: 40, column: 20, scope: !23)
 
-!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !26 = !DILocation(line: 50, column: 20, scope: !25)
 
-!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !28 = !DILocation(line: 60, column: 20, scope: !27)
 
-!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !30 = !DILocation(line: 70, column: 20, scope: !29)
 
-!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !32 = !DILocation(line: 80, column: 20, scope: !31)
 
-!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !34 = !DILocation(line: 90, column: 20, scope: !33)
 
-!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
 !36 = !DILocation(line: 100, column: 20, scope: !35)
diff --git a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
index a0fa79aa7edbe..7fc72077ee5b3 100644
--- a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
+++ b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll
@@ -72,5 +72,5 @@ entry:
 !14 = !{!15}
 !15 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 1, type: !10)
 !16 = !DILocation(line: 400, column: 3, scope: !7)
-!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
+!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
 !18 = !DILocation(line: 200, column: 3, scope: !17)

From 28d9f99a27aed5c431687cf720e20d2c2367d532 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton@mozilla.com>
Date: Mon, 10 Nov 2025 13:17:12 +0100
Subject: [PATCH 03/28] Remove unused standard headers: <string>, <optional>,
 <numeric>, <tuple> (#167232)

---
 llvm/include/llvm/Bitcode/BitcodeWriter.h                    | 1 -
 llvm/include/llvm/DebugInfo/GSYM/GsymContext.h               | 1 -
 llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h | 1 -
 llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h      | 3 ++-
 llvm/include/llvm/MC/DXContainerPSVInfo.h                    | 1 -
 llvm/include/llvm/MC/MCAssembler.h                           | 1 -
 llvm/include/llvm/Remarks/YAMLRemarkSerializer.h             | 1 -
 llvm/include/llvm/Support/FormatVariadic.h                   | 1 -
 llvm/include/llvm/Support/Jobserver.h                        | 1 -
 llvm/include/llvm/Transforms/Scalar/JumpThreading.h          | 1 -
 llvm/include/llvm/Transforms/Scalar/Scalarizer.h             | 1 -
 llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h   | 1 -
 llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h   | 1 -
 llvm/lib/Analysis/DXILResource.cpp                           | 1 -
 llvm/lib/Analysis/TFLiteUtils.cpp                            | 1 -
 llvm/lib/Analysis/TrainingLogger.cpp                         | 1 -
 llvm/lib/CodeGen/TargetSchedule.cpp                          | 1 -
 llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp         | 1 -
 llvm/lib/Frontend/Offloading/OffloadWrapper.cpp              | 1 -
 llvm/lib/Support/Mustache.cpp                                | 1 -
 llvm/lib/Support/Windows/Program.inc                         | 1 -
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp             | 1 -
 llvm/lib/Target/AVR/AVRTargetTransformInfo.h                 | 1 -
 llvm/lib/Target/DirectX/DXContainerGlobals.cpp               | 1 -
 llvm/lib/Target/DirectX/DXILRootSignature.h                  | 1 -
 llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h       | 1 -
 llvm/lib/Target/M68k/M68kSubtarget.h                         | 2 --
 llvm/lib/Target/PowerPC/PPCSubtarget.h                       | 1 -
 llvm/lib/Target/Sparc/SparcSubtarget.h                       | 1 -
 llvm/lib/Target/X86/X86TargetMachine.cpp                     | 1 -
 llvm/tools/llvm-exegesis/lib/X86/Target.cpp                  | 1 -
 llvm/tools/llvm-gpu-loader/amdhsa.cpp                        | 1 -
 llvm/tools/llvm-mc/Disassembler.h                            | 2 --
 llvm/tools/llvm-xray/trie-node.h                             | 1 -
 llvm/unittests/ADT/STLForwardCompatTest.cpp                  | 1 -
 llvm/unittests/ADT/SequenceTest.cpp                          | 4 +++-
 llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp   | 1 -
 llvm/unittests/IR/VFABIDemanglerTest.cpp                     | 1 -
 llvm/utils/TableGen/Common/CodeGenHwModes.h                  | 1 -
 llvm/utils/TableGen/Common/CodeGenTarget.h                   | 1 -
 llvm/utils/TableGen/TableGenBackends.h                       | 2 --
 41 files changed, 5 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h
index 1e72e847137a3..d88e261f8c684 100644
--- a/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -20,7 +20,6 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include <memory>
-#include <string>
 #include <vector>
 
 namespace llvm {
diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h
index e3e9b2bb91e8a..f9382fa8d9577 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h
@@ -12,7 +12,6 @@
 #include "llvm/DebugInfo/DIContext.h"
 #include <cstdint>
 #include <memory>
-#include <string>
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index be8cb927c26df..fc01afc6d8739 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -45,7 +45,6 @@
 #include <functional>
 #include <iterator>
 #include <memory>
-#include <optional>
 #include <utility>
 
 namespace llvm {
diff --git a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h
index 535635a9ad9b0..fcfb2db85a880 100644
--- a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h
+++ b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h
@@ -21,7 +21,8 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
-#include <tuple> // for std::pair
+
+#include <utility>
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/MC/DXContainerPSVInfo.h b/llvm/include/llvm/MC/DXContainerPSVInfo.h
index 3a2d2949d0223..eb6d9e14d92c3 100644
--- a/llvm/include/llvm/MC/DXContainerPSVInfo.h
+++ b/llvm/include/llvm/MC/DXContainerPSVInfo.h
@@ -17,7 +17,6 @@
 #include "llvm/TargetParser/Triple.h"
 
 #include <array>
-#include <numeric>
 #include <stdint.h>
 
 namespace llvm {
diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h
index 152b81e284c1a..dbae271a1c198 100644
--- a/llvm/include/llvm/MC/MCAssembler.h
+++ b/llvm/include/llvm/MC/MCAssembler.h
@@ -24,7 +24,6 @@
 #include <cstdint>
 #include <memory>
 #include <string>
-#include <tuple>
 #include <utility>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
index 69b8f9f000e1d..af9d809833023 100644
--- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
+++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h
@@ -16,7 +16,6 @@
 #include "llvm/Remarks/RemarkSerializer.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
-#include <optional>
 
 namespace llvm {
 namespace remarks {
diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h
index 85652924491ba..fdd448f7b5a3a 100644
--- a/llvm/include/llvm/Support/FormatVariadic.h
+++ b/llvm/include/llvm/Support/FormatVariadic.h
@@ -37,7 +37,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include <array>
 #include <cstddef>
-#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
diff --git a/llvm/include/llvm/Support/Jobserver.h b/llvm/include/llvm/Support/Jobserver.h
index 6bee3b5671d55..3c0c04537735d 100644
--- a/llvm/include/llvm/Support/Jobserver.h
+++ b/llvm/include/llvm/Support/Jobserver.h
@@ -68,7 +68,6 @@
 
 #include "llvm/ADT/StringRef.h"
 #include <memory>
-#include <string>
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index a03a38466b27b..1a19eb94e60ea 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -24,7 +24,6 @@
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include <optional>
 #include <utility>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
index 12513c2a704f2..35c9adbe17677 100644
--- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h
@@ -20,7 +20,6 @@
 
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Compiler.h"
-#include <optional>
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h
index cb48bb01e178a..19b573d6546a0 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h
@@ -14,7 +14,6 @@
 #define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H
 
 #include <cstdint>
-#include <optional>
 
 namespace llvm {
 
diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h
index cfcd1611e27fe..47aa2ff5930b0 100644
--- a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h
+++ b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h
@@ -16,7 +16,6 @@
 
 #include <memory>
 #include <optional>
-#include <string>
 
 namespace llvm {
 
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index 27114e0705a1d..033f516abe017 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Support/DXILABI.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <cstdint>
-#include <optional>
 
 #define DEBUG_TYPE "dxil-resource"
 
diff --git a/llvm/lib/Analysis/TFLiteUtils.cpp b/llvm/lib/Analysis/TFLiteUtils.cpp
index 2762e22f28cef..fcef1c8aa7380 100644
--- a/llvm/lib/Analysis/TFLiteUtils.cpp
+++ b/llvm/lib/Analysis/TFLiteUtils.cpp
@@ -30,7 +30,6 @@
 #include "tensorflow/lite/logger.h"
 
 #include <cassert>
-#include <numeric>
 #include <optional>
 
 using namespace llvm;
diff --git a/llvm/lib/Analysis/TrainingLogger.cpp b/llvm/lib/Analysis/TrainingLogger.cpp
index 344ca92e18b51..39f79cffdcd88 100644
--- a/llvm/lib/Analysis/TrainingLogger.cpp
+++ b/llvm/lib/Analysis/TrainingLogger.cpp
@@ -23,7 +23,6 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include <cassert>
-#include <numeric>
 
 using namespace llvm;
 
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index 7ae9e0e37bbab..cd951a1a4f53e 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -25,7 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
-#include <numeric>
 
 using namespace llvm;
 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp
index a88f4a554bcf0..a4bdd1f0a867c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp
@@ -15,7 +15,6 @@
 #include <cassert>
 #include <cinttypes>
 #include <cstdint>
-#include <optional>
 
 using namespace llvm;
 using namespace dwarf;
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index c4aa2c7638450..45818deda8aa6 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -29,7 +29,6 @@
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
 #include <memory>
-#include <string>
 #include <utility>
 
 using namespace llvm;
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 6c140be59fc4b..8b95049eb9648 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -10,7 +10,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
-#include <optional>
 #include <sstream>
 
 #define DEBUG_TYPE "mustache"
diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc
index ec785e407cc57..5dcd2c945bc85 100644
--- a/llvm/lib/Support/Windows/Program.inc
+++ b/llvm/lib/Support/Windows/Program.inc
@@ -23,7 +23,6 @@
 #include <fcntl.h>
 #include <io.h>
 #include <malloc.h>
-#include <numeric>
 #include <psapi.h>
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 068954f1764fb..0bf2b31b10846 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -54,7 +54,6 @@
 #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
 #include <memory>
 #include <optional>
-#include <string>
 
 using namespace llvm;
 
diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h
index 0daeeb8f11cfe..338a7c8082ca3 100644
--- a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h
+++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h
@@ -21,7 +21,6 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Function.h"
-#include <optional>
 
 namespace llvm {
 
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 677203d1c016b..95577dd668e1e 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -29,7 +29,6 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <cstdint>
-#include <optional>
 
 using namespace llvm;
 using namespace llvm::dxil;
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h
index b990b6c7410ac..ec82aa93dd07c 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -21,7 +21,6 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/MC/DXContainerRootSignature.h"
 #include "llvm/Pass.h"
-#include <optional>
 
 namespace llvm {
 namespace dxil {
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
index f2c00c7320e11..7cbc092ea3525 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h
@@ -19,7 +19,6 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include <memory>
-#include <string>
 #include <vector>
 
 namespace llvm {
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h
index 16ca7d2e6d0fd..4f9685814d9a9 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.h
+++ b/llvm/lib/Target/M68k/M68kSubtarget.h
@@ -27,8 +27,6 @@
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/Alignment.h"
 
-#include <string>
-
 #define GET_SUBTARGETINFO_HEADER
 #include "M68kGenSubtargetInfo.inc"
 
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index f275802fe1843..7d933588025fe 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -23,7 +23,6 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/TargetParser/Triple.h"
-#include <string>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "PPCGenSubtargetInfo.inc"
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h
index b1decca0a4f07..f575f6d7da37f 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -21,7 +21,6 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
-#include <string>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "SparcGenSubtargetInfo.inc"
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 0c2bd7c302f33..d4ad98af9b30c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -50,7 +50,6 @@
 #include "llvm/Transforms/CFGuard.h"
 #include <memory>
 #include <optional>
-#include <string>
 
 using namespace llvm;
 
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 6dc647655d92f..77b2f491f6a72 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -28,7 +28,6 @@
 #include "llvm/TargetParser/Host.h"
 
 #include <memory>
-#include <string>
 #include <vector>
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) &&              \
     !defined(_M_ARM64EC)
diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
index 5715058d8cfac..fa9ee185549a5 100644
--- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp
+++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -26,7 +26,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <thread>
-#include <tuple>
 #include <utility>
 
 // The implicit arguments of COV5 AMDGPU kernels.
diff --git a/llvm/tools/llvm-mc/Disassembler.h b/llvm/tools/llvm-mc/Disassembler.h
index dd8525d73200e..76cee9e84c312 100644
--- a/llvm/tools/llvm-mc/Disassembler.h
+++ b/llvm/tools/llvm-mc/Disassembler.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H
 #define LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H
 
-#include <string>
-
 namespace llvm {
 
 class MemoryBuffer;
diff --git a/llvm/tools/llvm-xray/trie-node.h b/llvm/tools/llvm-xray/trie-node.h
index b42b0293620dd..f96be592fc364 100644
--- a/llvm/tools/llvm-xray/trie-node.h
+++ b/llvm/tools/llvm-xray/trie-node.h
@@ -15,7 +15,6 @@
 #define LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H
 
 #include <forward_list>
-#include <numeric>
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/llvm/unittests/ADT/STLForwardCompatTest.cpp b/llvm/unittests/ADT/STLForwardCompatTest.cpp
index c6ae6e36cfbff..d0092fdb52b01 100644
--- a/llvm/unittests/ADT/STLForwardCompatTest.cpp
+++ b/llvm/unittests/ADT/STLForwardCompatTest.cpp
@@ -11,7 +11,6 @@
 #include "gtest/gtest.h"
 
 #include <optional>
-#include <tuple>
 #include <type_traits>
 #include <utility>
 
diff --git a/llvm/unittests/ADT/SequenceTest.cpp b/llvm/unittests/ADT/SequenceTest.cpp
index 7aa39568888b2..ab50ad0bb5606 100644
--- a/llvm/unittests/ADT/SequenceTest.cpp
+++ b/llvm/unittests/ADT/SequenceTest.cpp
@@ -11,7 +11,9 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-#include <numeric>
+#include <iterator>
+#include <limits>
+#include <vector>
 
 using namespace llvm;
 
diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
index b40b61e7a4a6f..441344b281411 100644
--- a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
@@ -23,7 +23,6 @@
 #include "gtest/gtest.h"
 
 #include <algorithm>
-#include <optional>
 #include <string>
 #include <vector>
 
diff --git a/llvm/unittests/IR/VFABIDemanglerTest.cpp b/llvm/unittests/IR/VFABIDemanglerTest.cpp
index e30e0f865f719..7d946131d4cba 100644
--- a/llvm/unittests/IR/VFABIDemanglerTest.cpp
+++ b/llvm/unittests/IR/VFABIDemanglerTest.cpp
@@ -15,7 +15,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
-#include <optional>
 
 using namespace llvm;
 
diff --git a/llvm/utils/TableGen/Common/CodeGenHwModes.h b/llvm/utils/TableGen/Common/CodeGenHwModes.h
index 55062b6ebeb35..7e45f8ef1de49 100644
--- a/llvm/utils/TableGen/Common/CodeGenHwModes.h
+++ b/llvm/utils/TableGen/Common/CodeGenHwModes.h
@@ -15,7 +15,6 @@
 #include "llvm/ADT/StringRef.h"
 #include <cassert>
 #include <map>
-#include <string>
 #include <utility>
 #include <vector>
 
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.h b/llvm/utils/TableGen/Common/CodeGenTarget.h
index b6f9d4209a13e..c1ed70d1739f0 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.h
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.h
@@ -28,7 +28,6 @@
 #include "llvm/CodeGenTypes/MachineValueType.h"
 #include <cassert>
 #include <memory>
-#include <optional>
 #include <string>
 #include <vector>
 
diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h
index 6e13864d12e16..261e17172abee 100644
--- a/llvm/utils/TableGen/TableGenBackends.h
+++ b/llvm/utils/TableGen/TableGenBackends.h
@@ -15,8 +15,6 @@
 #ifndef LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H
 #define LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H
 
-#include <string>
-
 // A TableGen backend is a function that looks like
 //
 //    EmitFoo(RecordKeeper &RK, raw_ostream &OS /*, anything else you need */ )

From 2705951a228222f3f642279a80dd095531f7d524 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 10 Nov 2025 12:32:38 +0000
Subject: [PATCH 04/28] [X86] 2012-01-10-UndefExceptionEdge.ll - regenerate
 test checks (#167307)

---
 .../X86/2012-01-10-UndefExceptionEdge.ll      | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962ddebc2115..f2b4c49b1dbcd 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:  Ltmp0:
+; CHECK-NEXT:  Ltmp0: ## EH_LABEL
 ; CHECK-NEXT:    ## implicit-def: $ebx
 ; CHECK-NEXT:    calll __Znam
-; CHECK-NEXT:  Ltmp1:
+; CHECK-NEXT:  Ltmp1: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.1: ## %bb11
 ; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    movb $1, %al
@@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    jne LBB0_9
 ; CHECK-NEXT:  ## %bb.10: ## %bb41
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT:  Ltmp2:
+; CHECK-NEXT:  Ltmp2: ## EH_LABEL
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, (%esp)
 ; CHECK-NEXT:    calll _Pjii
-; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:  Ltmp3: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.11: ## %bb42
 ; CHECK-NEXT:    ## in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax
@@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    jmp LBB0_8
 ; CHECK-NEXT:  LBB0_18: ## %bb43
-; CHECK-NEXT:  Ltmp5:
+; CHECK-NEXT:  Ltmp5: ## EH_LABEL
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:  Ltmp6: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_3
 ; CHECK-NEXT:  LBB0_2: ## %bb29
-; CHECK-NEXT:  Ltmp7:
+; CHECK-NEXT:  Ltmp7: ## EH_LABEL
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    calll _OnOverFlow
-; CHECK-NEXT:  Ltmp8:
+; CHECK-NEXT:  Ltmp8: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_3: ## %bb30
 ; CHECK-NEXT:    ud2
 ; CHECK-NEXT:  LBB0_4: ## %bb20.loopexit
-; CHECK-NEXT:  Ltmp4:
+; CHECK-NEXT:  Ltmp4: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_9:
 ; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:  LBB0_6: ## %bb23
@@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl
 ; CHECK-NEXT:  LBB0_5: ## %bb20.loopexit.split-lp
-; CHECK-NEXT:  Ltmp9:
+; CHECK-NEXT:  Ltmp9: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_6
 ; CHECK-NEXT:  Lfunc_end0:
 bb:

From 309729e8279a675dcef2da8912c7679ebca5c641 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 10 Nov 2025 07:40:31 -0500
Subject: [PATCH 05/28] [C2y] Claim nonconformance to WG14 N3348 (#166966)

This paper allows use of * in a multidimensional array extent within a
_Generic selection association, as a wildcard for any array extent.

Clang does not currently support this feature, so this is just some
initial test coverage along with an update to the conformance site.
---
 clang/test/C/C2y/n3348.c | 44 ++++++++++++++++++++++++++++++++++++++++
 clang/www/c_status.html  |  2 +-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/C/C2y/n3348.c

diff --git a/clang/test/C/C2y/n3348.c b/clang/test/C/C2y/n3348.c
new file mode 100644
index 0000000000000..e20c9f74883f9
--- /dev/null
+++ b/clang/test/C/C2y/n3348.c
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s
+
+/* WG14 N3348: No
+ * Matching of Multi-Dimensional Arrays in Generic Selection Expressions
+ *
+ * This allows use of * in a _Generic association as a placeholder for any size
+ * value.
+ *
+ * FIXME: Clang doesn't yet implement this paper. When we do implement it, we
+ * should expose the functionality in earlier language modes (C89) for
+ * compatibility with GCC.
+ */
+
+void test(int n, int m) {
+  static_assert(1 == _Generic(int[3][2], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
+                                                                           expected-error {{array has incomplete element type 'int[]'}}
+                                                                         */
+  static_assert(1 == _Generic(int[3][2], int[*][2]: 1, int[*][3]: 0));  // expected-error {{star modifier used outside of function prototype}}
+  static_assert(1 == _Generic(int[3][n], int[3][*]: 1, int[2][*]: 0));  /* expected-error {{star modifier used outside of function prototype}}
+                                                                           expected-error {{array has incomplete element type 'int[]'}}
+                                                                         */
+  static_assert(1 == _Generic(int[n][m], int[*][*]: 1, char[*][*]: 0)); /* expected-error 2 {{star modifier used outside of function prototype}}
+                                                                           expected-error {{array has incomplete element type 'int[]'}}
+                                                                         */
+  static_assert(1 == _Generic(int(*)[2], int(*)[*]: 1));                // expected-error {{star modifier used outside of function prototype}}
+}
+
+void questionable() {
+  // GCC accepts this despite the * appearing outside of a generic association,
+  // but it's not clear whether that's intentionally supported or an oversight.
+  // It gives a warning about * being used outside of a declaration, but not
+  // with an associated warning group.
+  static_assert(1 == _Generic(int[*][*], int[2][100]: 1)); /* expected-error 2 {{star modifier used outside of function prototype}}
+                                                              expected-error {{array has incomplete element type 'int[]'}}
+                                                            */
+  // GCC claims this matches multiple associations, so the functionality seems
+  // like it may be intended to work?
+  (void)_Generic(int[*][*], /* expected-error 2 {{star modifier used outside of function prototype}}
+                               expected-error {{array has incomplete element type 'int[]'}}
+                             */
+    int[2][100]: 1,
+    int[3][1000]: 2,
+  );
+}
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 2c1f6f4140a91..ccf19ea86957a 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -324,7 +324,7 @@ <h2 id="c2y">C2y implementation status</h2>
     <tr>
       <td>Matching of Multi-Dimensional Arrays in Generic Selection Expressions</td>
       <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3348.pdf">N3348</a></td>
-      <td class="unknown" align="center">Unknown</td>
+      <td class="none" align="center">No</td>
 	</tr>
     <tr>
       <td>The __COUNTER__ predefined macro</td>

From 342e28f8a94128b00be576b3df6d7629bc97b88b Mon Sep 17 00:00:00 2001
From: "J. Ryan Stinnett" <jryans@gmail.com>
Date: Mon, 10 Nov 2025 12:45:05 +0000
Subject: [PATCH 06/28] [clang][DebugInfo] Attach `DISubprogram` to additional
 call variants (#166202)

`DISubprogram`s are attached to call sites to support various debug info
features, including entry values and tail calls. Clang 9.0
(0f6516856670a435461f56a9faeb4aa8a35a6679) was the first version to
include this kind of call site `DISubprogram` attachment.

This earlier work appears to visit only some call site variants,
however. The call site attachment was added to a higher-level `EmitCall`
path in Clang's code gen that is only used by some call variants. In
particular, some C++ member calls use a different code gen path, which
did not include this call site attachment step, and thus the debug info
it triggers (e.g. call site entries) was not emitted for such calls.

This moves `DISubprogram` attachment to a lower-level call emission path
that is used by all call variants.

Fixes https://github.com/llvm/llvm-project/issues/161962
---
 clang/lib/CodeGen/CGCall.cpp                  | 19 ++++++++++++++
 clang/lib/CodeGen/CGDebugInfo.cpp             |  8 ++++--
 clang/lib/CodeGen/CGDebugInfo.h               | 11 ++++----
 clang/lib/CodeGen/CGExpr.cpp                  |  9 -------
 clang/test/DebugInfo/CXX/decl-member-call.cpp | 25 +++++++++++++++++++
 5 files changed, 55 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/DebugInfo/CXX/decl-member-call.cpp

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 465f3f4e670c2..d4d5ea80a84ec 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -38,6 +38,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
@@ -6277,6 +6278,24 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     pushDestroy(QualType::DK_nontrivial_c_struct, Ret.getAggregateAddress(),
                 RetTy);
 
+  // Generate function declaration DISuprogram in order to be used
+  // in debug info about call sites.
+  if (CGDebugInfo *DI = getDebugInfo()) {
+    // Ensure call site info would actually be emitted before collecting
+    // further callee info.
+    if (CalleeDecl && !CalleeDecl->hasAttr<NoDebugAttr>() &&
+        DI->getCallSiteRelatedAttrs() != llvm::DINode::FlagZero) {
+      CodeGenFunction CalleeCGF(CGM);
+      const GlobalDecl &CalleeGlobalDecl =
+          Callee.getAbstractInfo().getCalleeDecl();
+      CalleeCGF.CurGD = CalleeGlobalDecl;
+      FunctionArgList Args;
+      QualType ResTy = CalleeCGF.BuildFunctionArgList(CalleeGlobalDecl, Args);
+      DI->EmitFuncDeclForCallSite(
+          CI, DI->getFunctionType(CalleeDecl, ResTy, Args), CalleeGlobalDecl);
+    }
+  }
+
   return Ret;
 }
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index ca579c915f49d..bda7b7487f59b 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -4957,7 +4957,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc,
 
 void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                           QualType CalleeType,
-                                          const FunctionDecl *CalleeDecl) {
+                                          GlobalDecl CalleeGlobalDecl) {
   if (!CallOrInvoke)
     return;
   auto *Func = dyn_cast<llvm::Function>(CallOrInvoke->getCalledOperand());
@@ -4966,6 +4966,9 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
   if (Func->getSubprogram())
     return;
 
+  const FunctionDecl *CalleeDecl =
+      cast<FunctionDecl>(CalleeGlobalDecl.getDecl());
+
   // Do not emit a declaration subprogram for a function with nodebug
   // attribute, or if call site info isn't required.
   if (CalleeDecl->hasAttr<NoDebugAttr>() ||
@@ -4976,7 +4979,8 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
   // create the one describing the function in order to have complete
   // call site debug info.
   if (!CalleeDecl->isStatic() && !CalleeDecl->isInlined())
-    EmitFunctionDecl(CalleeDecl, CalleeDecl->getLocation(), CalleeType, Func);
+    EmitFunctionDecl(CalleeGlobalDecl, CalleeDecl->getLocation(), CalleeType,
+                     Func);
 }
 
 void CGDebugInfo::EmitInlineFunctionStart(CGBuilderTy &Builder, GlobalDecl GD) {
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index 78c3eb9c5792e..2378bdd780b3b 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -511,7 +511,7 @@ class CGDebugInfo {
   /// This is needed for call site debug info.
   void EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke,
                                QualType CalleeType,
-                               const FunctionDecl *CalleeDecl);
+                               GlobalDecl CalleeGlobalDecl);
 
   /// Constructs the debug code for exiting a function.
   void EmitFunctionEnd(CGBuilderTy &Builder, llvm::Function *Fn);
@@ -678,6 +678,10 @@ class CGDebugInfo {
   /// Emit symbol for debugger that holds the pointer to the vtable.
   void emitVTableSymbol(llvm::GlobalVariable *VTable, const CXXRecordDecl *RD);
 
+  /// Return flags which enable debug info emission for call sites, provided
+  /// that it is supported and enabled.
+  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
+
 private:
   /// Amend \p I's DebugLoc with \p Group (its source atom group) and \p
   /// Rank (lower nonzero rank is higher precedence). Does nothing if \p I
@@ -827,11 +831,6 @@ class CGDebugInfo {
                          unsigned LineNo, StringRef LinkageName,
                          llvm::GlobalVariable *Var, llvm::DIScope *DContext);
 
-
-  /// Return flags which enable debug info emission for call sites, provided
-  /// that it is supported and enabled.
-  llvm::DINode::DIFlags getCallSiteRelatedAttrs() const;
-
   /// Get the printing policy for producing names for debug info.
   PrintingPolicy getPrintingPolicy() const;
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 01f2161f27555..a837f00732748 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6632,15 +6632,6 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
                          E == MustTailCall, E->getExprLoc());
 
   if (auto *CalleeDecl = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
-    // Generate function declaration DISuprogram in order to be used
-    // in debug info about call sites.
-    if (CGDebugInfo *DI = getDebugInfo()) {
-      FunctionArgList Args;
-      QualType ResTy = BuildFunctionArgList(CalleeDecl, Args);
-      DI->EmitFuncDeclForCallSite(LocalCallOrInvoke,
-                                  DI->getFunctionType(CalleeDecl, ResTy, Args),
-                                  CalleeDecl);
-    }
     if (CalleeDecl->hasAttr<RestrictAttr>() ||
         CalleeDecl->hasAttr<AllocSizeAttr>()) {
       // Function has 'malloc' (aka. 'restrict') or 'alloc_size' attribute.
diff --git a/clang/test/DebugInfo/CXX/decl-member-call.cpp b/clang/test/DebugInfo/CXX/decl-member-call.cpp
new file mode 100644
index 0000000000000..95758a2985c0c
--- /dev/null
+++ b/clang/test/DebugInfo/CXX/decl-member-call.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -O1 -triple x86_64-unknown_unknown -emit-llvm \
+// RUN:   -debug-info-kind=standalone -dwarf-version=5 %s -o - | FileCheck %s
+
+// Ensure both nonmember and member calls to declared function
+// have attached `DISubprogram`s.
+
+int nonmember(int n);
+
+struct S {
+  int x;
+  int member(int n);
+};
+
+int main(int argc, char** argv) {
+  struct S s = {};
+  int a = s.member(argc);
+  int b = nonmember(argc);
+  return a + b;
+}
+
+// CHECK: declare !dbg ![[SP1:[0-9]+]] noundef i32 @_ZN1S6memberEi(
+// CHECK: declare !dbg ![[SP2:[0-9]+]] noundef i32 @_Z9nonmemberi(
+
+// CHECK: ![[SP1]] = !DISubprogram(name: "member", linkageName: "_ZN1S6memberEi"
+// CHECK: ![[SP2]] = !DISubprogram(name: "nonmember", linkageName: "_Z9nonmemberi"

From f22d5884b2847c575b83b067c1c6b69f224f0854 Mon Sep 17 00:00:00 2001
From: Tim Noack <tim@noack.id>
Date: Mon, 10 Nov 2025 13:45:31 +0100
Subject: [PATCH 07/28] [mlir] Dialect Conversion: Fix expensive pattern check
 in no-rollback mode (#166576)

Fixes a bug causing every conversion to fail fatally with "expected
pattern to replace the root operation or modify it in place" when
`MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS` is enabled and pattern
rollback is disabled.

When `allowPatternRollback` is disabled, the rewriter does not keep
track of the rewrites it performs and can therefore not use that list to
check whether the root op was replaced or updated in place.
---
 .../Transforms/Utils/DialectConversion.cpp    | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index f8c38fadbd229..9945a711d5c74 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -2856,17 +2856,19 @@ LogicalResult OperationLegalizer::legalizePatternResult(
   assert(impl.pendingRootUpdates.empty() && "dangling root updates");
 
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
-  // Check that the root was either replaced or updated in place.
-  auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites);
-  auto replacedRoot = [&] {
-    return hasRewrite<ReplaceOperationRewrite>(newRewrites, op);
-  };
-  auto updatedRootInPlace = [&] {
-    return hasRewrite<ModifyOperationRewrite>(newRewrites, op);
-  };
-  if (!replacedRoot() && !updatedRootInPlace())
-    llvm::report_fatal_error(
-        "expected pattern to replace the root operation or modify it in place");
+  if (impl.config.allowPatternRollback) {
+    // Check that the root was either replaced or updated in place.
+    auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites);
+    auto replacedRoot = [&] {
+      return hasRewrite<ReplaceOperationRewrite>(newRewrites, op);
+    };
+    auto updatedRootInPlace = [&] {
+      return hasRewrite<ModifyOperationRewrite>(newRewrites, op);
+    };
+    if (!replacedRoot() && !updatedRootInPlace())
+      llvm::report_fatal_error("expected pattern to replace the root operation "
+                               "or modify it in place");
+  }
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
 
   // Legalize each of the actions registered during application.

From be84705a9d39d2a7fd78cebf91dceec7bcb2c7a9 Mon Sep 17 00:00:00 2001
From: Tim Corringham <timothy.corringham@amd.com>
Date: Mon, 10 Nov 2025 12:57:19 +0000
Subject: [PATCH 08/28] [HLSL][SPIRV] Add error test for unpackhalf2x16
 (#166969)

Add an error test to check that a suitable error diagnostic is generated
for the use of the GL::unpackhalf2x16 operation in
 invalid contexts.

Fixes #166965

Co-authored-by: Tim Corringham <tcorring@amd.com>
---
 .../test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll

diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll
new file mode 100644
index 0000000000000..1d3ba2a38e55b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set.
+
+define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 {
+  %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0)
+  %3 = extractelement <2 x float> %2, i64 0
+  ret float %3
+}
+

From bba40ab4bd60e636e77362c46c181eafd377f541 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com>
Date: Mon, 10 Nov 2025 14:14:11 +0100
Subject: [PATCH 09/28] [MLIR][XeGPU] Decouple `inst_data` and `lane_layout` in
 propagation (#166941)

---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  11 +
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |   2 +-
 .../GPU/Pipelines/GPUToXeVMPipeline.cpp       |   9 +-
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 194 +++++++++++-------
 .../XeGPU/propagate-layout-inst-data.mlir     |  42 ++--
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |   2 +-
 6 files changed, 166 insertions(+), 94 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 9c35c07a7e587..3f27d690f949b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -379,6 +379,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
   );
 
   let builders = [
+    AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data),
+      [{
+        auto sg_layout = DenseI32ArrayAttr();
+        auto sg_data = DenseI32ArrayAttr();
+        auto order = DenseI32ArrayAttr();
+        auto lane_layout = DenseI32ArrayAttr();
+        auto lane_data = DenseI32ArrayAttr();
+        return $_get($_ctxt, sg_layout, sg_data,
+                     DenseI32ArrayAttr::get($_ctxt, inst_data),
+                     lane_layout, lane_data, order);
+      }]>,
     AttrBuilder<(ins "llvm::ArrayRef<int32_t>": $inst_data,
                       "llvm::ArrayRef<int32_t>": $lane_layout,
                      "llvm::ArrayRef<int32_t>": $lane_data),
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index e42799689e490..12270af870b3b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -47,7 +47,7 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
     Option<
     "layoutKind", "layout-kind", "std::string",
     /*default=*/"\"lane\"",
-    "Propagate a `sg` / `inst` / `lane` level of xegpu layouts.">
+    "Propagate `inst` / `lane` level of xegpu layouts.">
   ];
 }
 
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
index 1a1485ba2e02c..b097d3a0c9686 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp
@@ -63,13 +63,20 @@ void buildGPUPassPipeline(OpPassManager &pm,
   if (options.xegpuOpLevel == "workgroup") {
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUWgToSgDistribute());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+    xegpu::XeGPUPropagateLayoutOptions layoutOptions;
+    layoutOptions.layoutKind = "inst";
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        xegpu::createXeGPUPropagateLayout(layoutOptions));
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUBlocking());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
   }
   if (options.xegpuOpLevel == "subgroup" ||
       options.xegpuOpLevel == "workgroup") {
-    pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUPropagateLayout());
+    xegpu::XeGPUPropagateLayoutOptions layoutOptions;
+    layoutOptions.layoutKind = "lane";
+    pm.addNestedPass<gpu::GPUModuleOp>(
+        xegpu::createXeGPUPropagateLayout(layoutOptions));
     pm.addNestedPass<gpu::GPUModuleOp>(xegpu::createXeGPUSubgroupDistribute());
     pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
     pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 4e1a539771d2f..b3a780abd3f12 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -53,6 +53,8 @@ using namespace mlir::dataflow;
 
 namespace {
 
+enum class LayoutKind { Lane, InstData };
+
 //===----------------------------------------------------------------------===//
 // LayoutInfo
 //===----------------------------------------------------------------------===//
@@ -166,7 +168,8 @@ LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
   llvm_unreachable("Join should not be triggered by layout propagation.");
 }
 
-/// Construct a new layout with the transposed lane layout and lane data.
+/// Construct a new layout with the transposed inst_data or lane_layout,
+/// lane_data.
 LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
   if (!isAssigned())
     return {};
@@ -186,12 +189,20 @@ LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
   SmallVector<int32_t> laneData;
   SmallVector<int32_t> instData;
   for (int64_t idx : permutation) {
-    laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
-    laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
-    instData.push_back(static_cast<int32_t>(getInstData()[idx]));
+    if (getLaneLayout().size()) {
+      laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
+      laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
+    }
+    if (getInstData().size())
+      instData.push_back(static_cast<int32_t>(getInstData()[idx]));
   }
-  return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData,
-                                           laneLayout, laneData));
+  xegpu::LayoutAttr layoutAttr;
+  if (getLaneLayout().size())
+    layoutAttr =
+        xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData);
+  if (getInstData().size())
+    layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), instData);
+  return LayoutInfo(layoutAttr);
 }
 
 //===----------------------------------------------------------------------===//
@@ -213,15 +224,14 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> {
 /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
 static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
                                            unsigned rank,
-                                           const xegpu::uArch::uArch *uArch,
-                                           ArrayRef<int> instData) {
+                                           const xegpu::uArch::uArch *uArch) {
   assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
   if (rank == 1) {
     return LayoutInfo(
-        xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1}));
+        xegpu::LayoutAttr::get(ctx, {uArch->getSubgroupSize()}, {1}));
   }
-  return LayoutInfo(xegpu::LayoutAttr::get(
-      ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1}));
+  return LayoutInfo(
+      xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1}));
 }
 
 static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
@@ -236,7 +246,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
                                            const xegpu::uArch::uArch *uArch,
-                                           ArrayRef<int> instData,
                                            unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
@@ -247,16 +256,16 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (vectorTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData);
+    return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch);
   // Packing factor is determined by the element type bitwidth.
   unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth();
   int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
-    return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+    return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
                                              {uArch->getSubgroupSize(), 1},
                                              {1, packingFactor}));
   }
-  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData,
+  return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(),
                                            {1, uArch->getSubgroupSize()},
                                            {1, packingFactor}));
 }
@@ -264,7 +273,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy,
 /// Helper to get the default layout for a vector type.
 static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
                                            const xegpu::uArch::uArch *uArch,
-                                           ArrayRef<int> instData,
                                            unsigned packingSize,
                                            bool isScattered = false) {
   // Expecting a 1D or 2D vector.
@@ -275,18 +283,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
          "Expected int or float element type.");
   // If the rank is 1, then return default layout for 1D vector.
   if (tdescTy.getRank() == 1)
-    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData);
+    return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch);
   // Packing factor is determined by the element type bitwidth.
   unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth();
   int subgroupSize = uArch->getSubgroupSize();
   int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
   if (isScattered) {
     return LayoutInfo(xegpu::LayoutAttr::get(
-        tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor}));
+        tdescTy.getContext(), {subgroupSize, 1}, {1, packingFactor}));
   }
 
   return LayoutInfo(xegpu::LayoutAttr::get(
-      tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor}));
+      tdescTy.getContext(), {1, subgroupSize}, {1, packingFactor}));
 }
 
 /// Helper Function to get the expected layouts for DPAS operands. `lane_data`
@@ -298,7 +306,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy,
 static LayoutInfo
 getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
                                 const xegpu::uArch::uArch *uArch,
-                                ArrayRef<int> instData, unsigned packingSize) {
+                                unsigned packingSize) {
   Type elementTy = vectorTy.getElementType();
   assert(elementTy.isIntOrFloat() &&
          "Expected int or float type in DPAS operands");
@@ -310,10 +318,10 @@ getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
         {static_cast<int32_t>(packingSize / elementTy.getIntOrFloatBitWidth()),
          1});
     return LayoutInfo(
-        xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data));
+        xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data));
   }
   // Otherwise, return the default layout for the vector type.
-  return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize);
+  return getDefaultSIMTLayoutInfo(vectorTy, uArch, packingSize);
 }
 
 //===----------------------------------------------------------------------===//
@@ -328,6 +336,7 @@ getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum,
 class LayoutInfoPropagation
     : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
 private:
+  LayoutKind layoutKind;
   void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
                    ArrayRef<const LayoutInfoLattice *> results);
 
@@ -380,8 +389,10 @@ class LayoutInfoPropagation
 
 public:
   LayoutInfoPropagation(DataFlowSolver &solver,
-                        SymbolTableCollection &symbolTable)
-      : SparseBackwardDataFlowAnalysis(solver, symbolTable) {}
+                        SymbolTableCollection &symbolTable,
+                        LayoutKind layoutKind)
+      : SparseBackwardDataFlowAnalysis(solver, symbolTable),
+        layoutKind(layoutKind) {}
   using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;
 
   LogicalResult
@@ -499,8 +510,14 @@ void LayoutInfoPropagation::visitPrefetchNdOp(
           "No suitable instruction multiple found for the given shape.");
     instData = {instHeight, instWidth};
   }
-  auto prefetchLayout = getDefaultSIMTLayoutInfo(
-      tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize());
+  LayoutInfo prefetchLayout;
+  if (layoutKind == LayoutKind::InstData)
+    prefetchLayout =
+        LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
+  else
+    prefetchLayout = getDefaultSIMTLayoutInfo(
+        tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());
+
   // Propagate the layout to the source tensor descriptor.
   propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
 }
@@ -627,14 +644,24 @@ void LayoutInfoPropagation::visitDpasOp(
   SmallVector<int> instDataA = {maxALen, subgroupSize};
   SmallVector<int> instDataB = {subgroupSize, maxBLen};
 
-  propagateIfChanged(operands[0],
-                     operands[0]->meet(getSIMTLayoutInfoForDPASOperand(
-                         aTy, 0, uArch, instDataA,
-                         uArchInstruction->getPackedFormatBitSizeA())));
-  propagateIfChanged(operands[1],
-                     operands[1]->meet(getSIMTLayoutInfoForDPASOperand(
-                         bTy, 1, uArch, instDataB,
-                         uArchInstruction->getPackedFormatBitSizeB())));
+  LayoutInfo dpasALayout;
+  LayoutInfo dpasBLayout;
+  LayoutInfo dpasCLayout;
+
+  if (layoutKind == LayoutKind::InstData) {
+    dpasALayout =
+        LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA));
+    dpasBLayout =
+        LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB));
+  } else {
+    dpasALayout = getSIMTLayoutInfoForDPASOperand(
+        aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA());
+    dpasBLayout = getSIMTLayoutInfoForDPASOperand(
+        bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB());
+  }
+
+  propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
+  propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
   if (operands.size() > 2) {
     VectorType cTy = dpas.getAccType();
     const unsigned dataCLen = bTy.getShape().back();
@@ -645,10 +672,15 @@ void LayoutInfoPropagation::visitDpasOp(
       dpas.emitWarning(
           "No suitable instruction multiple found for the given shape.");
     SmallVector<int> instDataC = {maxALen, maxCLen};
-    propagateIfChanged(operands[2],
-                       operands[2]->meet(getSIMTLayoutInfoForDPASOperand(
-                           cTy, 2, uArch, instDataC,
-                           uArchInstruction->getPackedFormatBitSizeB())));
+
+    if (layoutKind == LayoutKind::InstData)
+      dpasCLayout =
+          LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC));
+    else
+      dpasCLayout = getSIMTLayoutInfoForDPASOperand(
+          cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB());
+
+    propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout));
   }
 }
 
@@ -685,9 +717,15 @@ void LayoutInfoPropagation::visitStoreNdOp(
           "No suitable instruction multiple found for the given shape.");
     instData = {instHeight, instWidth};
   }
-  LayoutInfo storeLayout =
-      getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData,
-                               uArchInstruction->getPackedFormatBitSize());
+
+  LayoutInfo storeLayout;
+  if (layoutKind == LayoutKind::InstData)
+    storeLayout =
+        LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
+  else
+    storeLayout =
+        getDefaultSIMTLayoutInfo(store.getValueType(), uArch,
+                                 uArchInstruction->getPackedFormatBitSize());
   // Both operands should have the same layout
   for (LayoutInfoLattice *operand : operands)
     propagateIfChanged(operand, operand->meet(storeLayout));
@@ -818,9 +856,13 @@ void LayoutInfoPropagation::visitLoadGatherOp(
     if (srcTdescTy.getChunkSizeAsInt() > 1)
       instData.push_back(chunkSize);
   }
-  LayoutInfo layout = getDefaultSIMTLayoutInfo(
-      payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
-      /*scattered*/ true);
+  LayoutInfo layout;
+  if (layoutKind == LayoutKind::InstData)
+    layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData));
+  else
+    layout = getDefaultSIMTLayoutInfo(payloadTy, uArch,
+                                      uArch->getGeneralPackedFormatBitSize(),
+                                      /*scattered*/ true);
 
   // Mask operand should have 1D default layout.
   LayoutInfo maskLayout =
@@ -864,33 +906,36 @@ void LayoutInfoPropagation::visitStoreScatterOp(
     storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
     return;
   }
+  LayoutInfo payloadLayout;
   auto uArch = getUArch(getChipStr(storeScatter).value_or(""));
   const int subgroupSize = uArch->getSubgroupSize();
 
-  auto payloadShape = payloadTy.getShape();
-  if (payloadShape.size() > 1)
-    assert(
-        payloadShape[0] == subgroupSize &&
-        "Expected the first dimension of 2D tensor descriptor to be equal to "
-        "subgroup size.");
-
-  SmallVector<int> instData{subgroupSize};
-  if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1)
-    instData.push_back(chunkSize);
-  else if (auto dstTdescTy =
-               dyn_cast<xegpu::TensorDescType>(storeScatter.getDestType())) {
-    if (dstTdescTy.getChunkSizeAsInt() > 1)
-      instData.push_back(chunkSize);
-  }
-
-  LayoutInfo payloadLayout;
-
   if (auto layout = storeScatter.getLayoutAttr()) {
     payloadLayout = LayoutInfo(layout);
   } else {
-    payloadLayout = getDefaultSIMTLayoutInfo(
-        payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(),
-        /*scattered=*/true);
+    if (layoutKind == LayoutKind::InstData) {
+      SmallVector<int> instData{subgroupSize};
+      if (auto chunkSize = storeScatter.getChunkSize().value_or(0);
+          chunkSize > 1)
+        instData.push_back(chunkSize);
+      else if (auto dstTdescTy = dyn_cast<xegpu::TensorDescType>(
+                   storeScatter.getDestType())) {
+        if (dstTdescTy.getChunkSizeAsInt() > 1)
+          instData.push_back(chunkSize);
+      }
+      payloadLayout = LayoutInfo(
+          xegpu::LayoutAttr::get(storeScatter.getContext(), instData));
+    } else {
+      auto payloadShape = payloadTy.getShape();
+      if (payloadShape.size() > 1)
+        assert(payloadShape[0] == subgroupSize &&
+               "Expected the first dimension of 2D tensor descriptor to be "
+               "equal to "
+               "subgroup size.");
+      payloadLayout = getDefaultSIMTLayoutInfo(
+          payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(),
+          /*scattered=*/true);
+    }
   }
 
   LayoutInfo maskLayout =
@@ -916,10 +961,10 @@ class RunLayoutInfoPropagation {
 public:
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)
 
-  RunLayoutInfoPropagation(Operation *op) : target(op) {
+  RunLayoutInfoPropagation(Operation *op, LayoutKind layoutKind) : target(op) {
     SymbolTableCollection symbolTable;
     loadBaselineAnalyses(solver);
-    solver.load<LayoutInfoPropagation>(symbolTable);
+    solver.load<LayoutInfoPropagation>(symbolTable, layoutKind);
     (void)solver.initializeAndRun(op);
   }
 
@@ -1159,7 +1204,18 @@ struct XeGPUPropagateLayoutPass final
 } // namespace
 
 void XeGPUPropagateLayoutPass::runOnOperation() {
-  auto &analysis = getAnalysis<RunLayoutInfoPropagation>();
+  LayoutKind layoutKind;
+  if (this->layoutKind == "lane") {
+    layoutKind = LayoutKind::Lane;
+  } else if (this->layoutKind == "inst") {
+    layoutKind = LayoutKind::InstData;
+  } else {
+    getOperation()->emitError("Unsupported layout kind option: " +
+                              this->layoutKind);
+    signalPassFailure();
+    return;
+  }
+  RunLayoutInfoPropagation analysis(getOperation(), layoutKind);
   // Print the analysis result and exit. (for debugging purposes)
   if (printOnly) {
     auto &os = llvm::outs();
@@ -1173,8 +1229,6 @@ void XeGPUPropagateLayoutPass::runOnOperation() {
       return {};
     xegpu::DistributeLayoutAttr layoutAttr =
         cast<xegpu::DistributeLayoutAttr>(layout.get());
-    if (this->layoutKind == "lane")
-      layoutAttr = layoutAttr.dropInstData();
     if (layout.isSliceLayout())
       return cast<xegpu::SliceAttr>(layoutAttr);
     return cast<xegpu::LayoutAttr>(layoutAttr);
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
index 58461b8be52c4..c31ef323a94d2 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir
@@ -2,17 +2,17 @@
 
 // CHECK-LABEL: func.func @dpas_f16(
 // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) {
-// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32>
-// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>>
-// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]]  {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
-// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16>
-// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]]  {layout_result_0 = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>} :
-// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16>
-// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32>
+// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>
+// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>>
+// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]]  {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16>
+// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]]  {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} :
+// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16>
+// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
 // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32>
-// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>
+// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>>
 gpu.module @test {
 
 func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
@@ -46,18 +46,18 @@ gpu.module @test_kernel {
     %out:3 = scf.for %k = %c0 to %c1024 step %c32
       iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
       -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) {
-      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
-      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x32xf16>
+      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} :
+      //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16>
       %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
       %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16>
 
-      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x32xf16>
+      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : vector<16x32xf16>
       %c = arith.addf %a, %b : vector<16x32xf16>
 
-      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>>
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
       xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16>
 
-      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>>
       %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
       %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
       %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16>
@@ -85,18 +85,18 @@ gpu.module @test_kernel {
     %out:3 = scf.for %k = %c0 to %c1024 step %c32
       iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc)
       -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) {
-      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} :
-      //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<12x32xf16>
+      //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} :
+      //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16>
       %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
       %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16>
 
-      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<12x32xf16>
+      //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : vector<12x32xf16>
       %c = arith.addf %a, %b : vector<12x32xf16>
 
-      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>>
+      //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
       xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16>
 
-      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+      //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>>
       %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
       %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
       %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16>
@@ -114,7 +114,7 @@ gpu.module @test {
 // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
 // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
 // CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}>
-// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8], lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+// CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
 // CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
 func.func @scatter_ops_chunksize(%src: memref<256xf16>) {
   %1 = arith.constant dense<1>: vector<16xi1>
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index 61e315d0d2080..eb004932af4be 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=lane" -split-input-file %s | FileCheck %s
 
 gpu.module @test {
 // CHECK-LABEL: func.func @dpas_f16(

From 5b204530629cabecfeba6f890dd7cb03e0b1a2e2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 10 Nov 2025 14:39:43 +0000
Subject: [PATCH 10/28] [CodeGenPrepare] sinkCmpExpression - don't sink larger
 than legal integer comparisons (#166778)

A generic alternative to #166564 - make the assumption that expanding
integer comparisons will be expensive if they are larger than the largest
legal type so avoid sinking if they are also used in the current BB + any phis.

Fixes #166534
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 17 ++++-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll | 48 ++++++-------
 llvm/test/CodeGen/X86/pr166534.ll             | 68 +++++--------------
 3 files changed, 53 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0309e225d9df4..b6dd174f9be80 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1839,7 +1839,8 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
 /// lose; some adjustment may be wanted there.
 ///
 /// Return true if any changes are made.
-static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
+static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI,
+                              const DataLayout &DL) {
   if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
     return false;
 
@@ -1847,6 +1848,18 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
   if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
     return false;
 
+  bool UsedInPhiOrCurrentBlock = any_of(Cmp->users(), [Cmp](User *U) {
+    return isa<PHINode>(U) ||
+           cast<Instruction>(U)->getParent() == Cmp->getParent();
+  });
+
+  // Avoid sinking larger than legal integer comparisons unless its ONLY used in
+  // another BB.
+  if (UsedInPhiOrCurrentBlock && Cmp->getOperand(0)->getType()->isIntegerTy() &&
+      Cmp->getOperand(0)->getType()->getScalarSizeInBits() >
+          DL.getLargestLegalIntTypeSizeInBits())
+    return false;
+
   // Only insert a cmp in each block once.
   DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
 
@@ -2224,7 +2237,7 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
 }
 
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
-  if (sinkCmpExpression(Cmp, *TLI))
+  if (sinkCmpExpression(Cmp, *TLI, *DL))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index ba6769b2aa3e1..0306bb18c2aed 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp {
   ret i64 %Q
 }
 
-; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
+; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic.
 
 define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
 ; RV32-LABEL: uaddo4:
@@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    .cfi_offset s4, -24
 ; RV32-NEXT:    .cfi_offset s5, -28
 ; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    mv s5, a5
-; RV32-NEXT:    mv s3, a1
+; RV32-NEXT:    mv s1, a5
+; RV32-NEXT:    mv s4, a1
 ; RV32-NEXT:    andi a1, a5, 1
-; RV32-NEXT:    beqz a1, .LBB32_8
+; RV32-NEXT:    beqz a1, .LBB32_6
 ; RV32-NEXT:  # %bb.1: # %t
 ; RV32-NEXT:    mv s0, a4
-; RV32-NEXT:    mv s2, a3
-; RV32-NEXT:    mv s1, a2
-; RV32-NEXT:    mv s4, a0
-; RV32-NEXT:    beq s3, a3, .LBB32_3
+; RV32-NEXT:    mv s3, a3
+; RV32-NEXT:    mv s2, a2
+; RV32-NEXT:    mv s5, a0
+; RV32-NEXT:    beq s4, a3, .LBB32_3
 ; RV32-NEXT:  # %bb.2: # %t
-; RV32-NEXT:    sltu s6, s3, s2
+; RV32-NEXT:    sltu s6, s4, s3
 ; RV32-NEXT:    j .LBB32_4
 ; RV32-NEXT:  .LBB32_3:
-; RV32-NEXT:    sltu s6, s4, s1
+; RV32-NEXT:    sltu s6, s5, s2
 ; RV32-NEXT:  .LBB32_4: # %t
 ; RV32-NEXT:    mv a0, s6
 ; RV32-NEXT:    call call
-; RV32-NEXT:    beqz s6, .LBB32_8
+; RV32-NEXT:    beqz s6, .LBB32_6
 ; RV32-NEXT:  # %bb.5: # %end
-; RV32-NEXT:    sltu a1, s4, s1
-; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beq s3, s2, .LBB32_7
-; RV32-NEXT:  # %bb.6: # %end
-; RV32-NEXT:    sltu a0, s3, s2
-; RV32-NEXT:  .LBB32_7: # %end
-; RV32-NEXT:    sub a2, s3, s2
-; RV32-NEXT:    sub a3, s4, s1
-; RV32-NEXT:    sub a2, a2, a1
-; RV32-NEXT:    sw a3, 0(s0)
-; RV32-NEXT:    sw a2, 4(s0)
-; RV32-NEXT:    j .LBB32_9
-; RV32-NEXT:  .LBB32_8: # %f
-; RV32-NEXT:    mv a0, s5
-; RV32-NEXT:  .LBB32_9: # %f
+; RV32-NEXT:    sltu a0, s5, s2
+; RV32-NEXT:    sub a1, s4, s3
+; RV32-NEXT:    sub a2, s5, s2
+; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    sw a2, 0(s0)
+; RV32-NEXT:    sw a1, 4(s0)
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    j .LBB32_7
+; RV32-NEXT:  .LBB32_6: # %f
+; RV32-NEXT:    mv a0, s1
+; RV32-NEXT:  .LBB32_7: # %f
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index aef44cc3e40d0..162a0c93bfcf4 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,100 +7,64 @@
 define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movq (%rdi), %rax
-; SSE2-NEXT:    movq 8(%rdi), %r8
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movq (%rsi), %r9
-; SSE2-NEXT:    movq 8(%rsi), %rdi
 ; SSE2-NEXT:    movdqu (%rsi), %xmm1
 ; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pmovmskb %xmm1, %esi
-; SSE2-NEXT:    xorl %r10d, %r10d
+; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
-; SSE2-NEXT:    sete %r10b
-; SSE2-NEXT:    orq %r10, (%rdx)
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    orq %rax, (%rdx)
 ; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT:    jne .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %if.then
-; SSE2-NEXT:    xorq %r9, %rax
-; SSE2-NEXT:    xorq %rdi, %r8
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    orq %rax, %r8
-; SSE2-NEXT:    sete %dl
-; SSE2-NEXT:    orq %rdx, (%rcx)
+; SSE2-NEXT:    orq %rax, (%rcx)
 ; SSE2-NEXT:  .LBB0_2: # %if.end
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movq (%rdi), %rax
-; SSE4-NEXT:    movq 8(%rdi), %r8
 ; SSE4-NEXT:    movdqu (%rdi), %xmm0
-; SSE4-NEXT:    movq (%rsi), %r9
-; SSE4-NEXT:    movq 8(%rsi), %rdi
 ; SSE4-NEXT:    movdqu (%rsi), %xmm1
 ; SSE4-NEXT:    pxor %xmm0, %xmm1
-; SSE4-NEXT:    xorl %esi, %esi
+; SSE4-NEXT:    xorl %eax, %eax
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
-; SSE4-NEXT:    sete %sil
-; SSE4-NEXT:    orq %rsi, (%rdx)
+; SSE4-NEXT:    sete %al
+; SSE4-NEXT:    orq %rax, (%rdx)
 ; SSE4-NEXT:    ptest %xmm1, %xmm1
 ; SSE4-NEXT:    jne .LBB0_2
 ; SSE4-NEXT:  # %bb.1: # %if.then
-; SSE4-NEXT:    xorq %r9, %rax
-; SSE4-NEXT:    xorq %rdi, %r8
-; SSE4-NEXT:    xorl %edx, %edx
-; SSE4-NEXT:    orq %rax, %r8
-; SSE4-NEXT:    sete %dl
-; SSE4-NEXT:    orq %rdx, (%rcx)
+; SSE4-NEXT:    orq %rax, (%rcx)
 ; SSE4-NEXT:  .LBB0_2: # %if.end
 ; SSE4-NEXT:    retq
 ;
 ; AVX2-LABEL: pr166534:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movq (%rdi), %rax
-; AVX2-NEXT:    movq 8(%rdi), %r8
 ; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    movq (%rsi), %rdi
 ; AVX2-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    movq 8(%rsi), %rsi
-; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    vptest %xmm0, %xmm0
-; AVX2-NEXT:    sete %r9b
-; AVX2-NEXT:    orq %r9, (%rdx)
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    orq %rax, (%rdx)
 ; AVX2-NEXT:    vptest %xmm0, %xmm0
 ; AVX2-NEXT:    jne .LBB0_2
 ; AVX2-NEXT:  # %bb.1: # %if.then
-; AVX2-NEXT:    xorq %rdi, %rax
-; AVX2-NEXT:    xorq %rsi, %r8
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    orq %rax, %r8
-; AVX2-NEXT:    sete %dl
-; AVX2-NEXT:    orq %rdx, (%rcx)
+; AVX2-NEXT:    orq %rax, (%rcx)
 ; AVX2-NEXT:  .LBB0_2: # %if.end
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: pr166534:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    movq (%rdi), %rax
-; AVX512-NEXT:    movq 8(%rdi), %r8
 ; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    movq (%rsi), %r9
-; AVX512-NEXT:    movq 8(%rsi), %rdi
 ; AVX512-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    xorl %eax, %eax
 ; AVX512-NEXT:    vptest %xmm0, %xmm0
-; AVX512-NEXT:    sete %sil
-; AVX512-NEXT:    orq %rsi, (%rdx)
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    orq %rax, (%rdx)
 ; AVX512-NEXT:    vptest %xmm0, %xmm0
 ; AVX512-NEXT:    jne .LBB0_2
 ; AVX512-NEXT:  # %bb.1: # %if.then
-; AVX512-NEXT:    xorq %r9, %rax
-; AVX512-NEXT:    xorq %rdi, %r8
-; AVX512-NEXT:    xorl %edx, %edx
-; AVX512-NEXT:    orq %rax, %r8
-; AVX512-NEXT:    sete %dl
-; AVX512-NEXT:    orq %rdx, (%rcx)
+; AVX512-NEXT:    orq %rax, (%rcx)
 ; AVX512-NEXT:  .LBB0_2: # %if.end
 ; AVX512-NEXT:    retq
 entry:

From a5d4ba7be4399c8b4aca9e841ca1d145c7fb57f3 Mon Sep 17 00:00:00 2001
From: Raul Tambre <raul@tambre.ee>
Date: Mon, 10 Nov 2025 16:44:43 +0200
Subject: [PATCH 11/28] [clang][doc] Document and prefer __asm as mangled name
 attribute (#167226)

It's supported together with the other spellings and results in the same attribute.
Document it and prefer it in the documentation as the `asm()` spelling is C++ and GNU-only.

See: https://github.com/llvm/llvm-project/pull/167221#issuecomment-3508464545
---
 clang/include/clang/Basic/Attr.td     | 2 +-
 clang/include/clang/Basic/AttrDocs.td | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 1013bfc575747..da608370645ac 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1069,7 +1069,7 @@ def AVRSignal : InheritableAttr, TargetSpecificAttr<TargetAVR> {
 }
 
 def AsmLabel : InheritableAttr {
-  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm__">];
+  let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm">, CustomKeyword<"__asm__">];
   let Args = [
       // Label specifies the mangled name for the decl.
       StringArgument<"Label">, ];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 1be9a96aa44de..f1dbd8af6093a 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -4295,17 +4295,17 @@ used by other languages. (This prefix is also added to the standard Itanium
 C++ ABI prefix on "mangled" symbol names, so that e.g. on such targets the true
 symbol name for a C++ variable declared as ``int cppvar;`` would be
 ``__Z6cppvar``; note the two underscores.)  This prefix is *not* added to the
-symbol names specified by the ``asm`` attribute; programmers wishing to match a
-C symbol name must compensate for this.
+symbol names specified by the ``__asm`` attribute; programmers wishing to match
+a C symbol name must compensate for this.
 
 For example, consider the following C code:
 
 .. code-block:: c
 
-  int var1 asm("altvar") = 1;  // "altvar" in symbol table.
+  int var1 __asm("altvar") = 1;  // "altvar" in symbol table.
   int var2 = 1; // "_var2" in symbol table.
 
-  void func1(void) asm("altfunc");
+  void func1(void) __asm("altfunc");
   void func1(void) {} // "altfunc" in symbol table.
   void func2(void) {} // "_func2" in symbol table.
 

From 1553f90f93d30b41457097cf274c3791b182f316 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= <tuomas.karna@intel.com>
Date: Mon, 10 Nov 2025 17:02:07 +0200
Subject: [PATCH 12/28] [MLIR][XeGPU][TransformOps] Add get_desc_op (#166801)

Add `transform.xegpu.get_desc_op` transform op that finds a
`xegpu.create_nd_tdesc` producer op of a `Value`.
---
 .../XeGPU/TransformOps/XeGPUTransformOps.td   | 28 ++++++--
 .../XeGPU/TransformOps/XeGPUTransformOps.cpp  | 65 +++++++++++++++++++
 mlir/python/mlir/dialects/transform/xegpu.py  | 21 ++++++
 mlir/test/Dialect/XeGPU/transform-ops.mlir    | 62 ++++++++++++++++++
 .../python/dialects/transform_xegpu_ext.py    | 17 ++++-
 5 files changed, 187 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
index b985d5450be0e..ed277ef7bd554 100644
--- a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
@@ -16,6 +16,24 @@ include "mlir/Dialect/Transform/IR/TransformTypes.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 
+def GetDescOp : Op<Transform_Dialect, "xegpu.get_desc_op", [
+  DeclareOpInterfaceMethods<TransformOpInterface>,
+  NavigationTransformOpTrait, MemoryEffectsOpInterface
+]> {
+
+  let summary = "Get a handle to the descriptor op of a value.";
+  let description = [{
+    Traces the producers of the given value until an `xegpu.create_nd_tdesc`
+    descriptor op is found. Returns a handle to it. Currently traces
+    producers by following only the first operand of producer ops.
+  }];
+
+  let arguments = (ins TransformValueHandleTypeInterface:$target);
+
+  let results = (outs TransformHandleTypeInterface:$descHandle);
+  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
+}
+
 def SetDescLayoutOp : Op<Transform_Dialect, "xegpu.set_desc_layout", [
   AttrSizedOperandSegments,
   DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
@@ -31,16 +49,16 @@ def SetDescLayoutOp : Op<Transform_Dialect, "xegpu.set_desc_layout", [
   }];
 
   let arguments = (ins
-                   TransformHandleTypeInterface : $target,
-                   Variadic<TransformAnyParamTypeOrAnyHandle> : $sg_layout,
-                   Variadic<TransformAnyParamTypeOrAnyHandle> : $sg_data,
-                   Variadic<TransformAnyParamTypeOrAnyHandle> : $inst_data,
+                   TransformHandleTypeInterface:$target,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$sg_layout,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$sg_data,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$inst_data,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_layout,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_data,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_inst_data
                    );
 
-  let results = (outs TransformHandleTypeInterface : $transformed);
+  let results = (outs TransformHandleTypeInterface:$transformed);
   let builders = [
     OpBuilder<(ins "Value":$target,
                    "ArrayRef<OpFoldResult>":$mixedSgLayout,
diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
index 8943ba09d9c34..0683699f467e9 100644
--- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
@@ -13,6 +13,9 @@
 
 #include <optional>
 
+#include "llvm/Support/DebugLog.h"
+#define DEBUG_TYPE "xegpu-transforms"
+
 using namespace mlir;
 using namespace mlir::transform;
 
@@ -76,6 +79,45 @@ static DiagnosedSilenceableFailure convertMixedValuesToInt(
   return DiagnosedSilenceableFailure::success();
 }
 
+/// Find producer operation of type T for the given value.
+/// It's assumed that producer ops are chained through their first operand.
+/// Producer chain is traced trough loop block arguments (init values).
+template <typename T>
+static std::optional<T> findProducerOfType(Value val) {
+  Value currentValue = val;
+  if (!currentValue.getDefiningOp()) {
+    // Value may be a block argument initialized outside a loop.
+    if (val.getNumUses() == 0) {
+      LDBG() << "Failed to find producer op, value has no uses.";
+      return std::nullopt;
+    }
+    auto userOp = val.getUsers().begin();
+    auto parentLoop = userOp->getParentOfType<LoopLikeOpInterface>();
+    if (!parentLoop) {
+      LDBG() << "Failed to find producer op, not in a loop.";
+      return std::nullopt;
+    }
+    int64_t iterArgIdx;
+    if (auto iterArg = llvm::dyn_cast<BlockArgument>(currentValue)) {
+      auto numInductionVars = parentLoop.getLoopInductionVars()->size();
+      iterArgIdx = iterArg.getArgNumber() - numInductionVars;
+      currentValue = parentLoop.getInits()[iterArgIdx];
+    } else {
+      LDBG() << "Failed to find producer op, value not in init values.";
+      return std::nullopt;
+    }
+  }
+  Operation *producerOp = currentValue.getDefiningOp();
+
+  if (auto matchingOp = dyn_cast<T>(producerOp))
+    return matchingOp;
+
+  if (producerOp->getNumOperands() == 0)
+    return std::nullopt;
+
+  return findProducerOfType<T>(producerOp->getOperand(0));
+}
+
 /// Create a layout attribute from the given parameters.
 static xegpu::LayoutAttr
 createLayoutAttr(MLIRContext *ctx, ArrayRef<int32_t> sgLayout,
@@ -111,6 +153,29 @@ setDescLayout(transform::TransformRewriter &rewriter,
   return newDescOp;
 }
 
+DiagnosedSilenceableFailure
+transform::GetDescOp::apply(transform::TransformRewriter &rewriter,
+                            transform::TransformResults &results,
+                            transform::TransformState &state) {
+  auto targetValues = state.getPayloadValues(getTarget());
+  if (!llvm::hasSingleElement(targetValues)) {
+    return emitDefiniteFailure()
+           << "requires exactly one target value handle (got "
+           << llvm::range_size(targetValues) << ")";
+  }
+
+  auto maybeDescOp =
+      findProducerOfType<xegpu::CreateNdDescOp>(*targetValues.begin());
+  if (!maybeDescOp) {
+    return emitSilenceableFailure(getLoc())
+           << "Could not find a matching descriptor op when walking the "
+              "producer chain of the first operand.";
+  }
+
+  results.set(llvm::cast<OpResult>(getResult()), {*maybeDescOp});
+  return DiagnosedSilenceableFailure::success();
+}
+
 void transform::SetDescLayoutOp::build(OpBuilder &builder,
                                        OperationState &result, Value target,
                                        ArrayRef<OpFoldResult> mixedSgLayout,
diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py
index 2918bf592880a..d23f2ac16429f 100644
--- a/mlir/python/mlir/dialects/transform/xegpu.py
+++ b/mlir/python/mlir/dialects/transform/xegpu.py
@@ -7,6 +7,7 @@
 
 try:
     from ...ir import *
+    from ...dialects import transform
     from .._ods_common import _cext as _ods_cext
     from .._ods_common import (
         MixedValues,
@@ -20,6 +21,26 @@
 from typing import Union, Optional
 
 
+@_ods_cext.register_operation(_Dialect, replace=True)
+class GetDescOp(GetDescOp):
+    """Specialization for GetDescOp class."""
+
+    def __init__(
+        self,
+        target: Value,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        desc_type = transform.AnyOpType.get()
+        super().__init__(
+            desc_type,
+            target,
+            loc=loc,
+            ip=ip,
+        )
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class SetDescLayoutOp(SetDescLayoutOp):
     """Specialization for SetDescLayoutOp class."""
diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir
index 23e1cd946b4cd..342de429d2e90 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir
@@ -1,5 +1,67 @@
 // RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s
 
+// CHECK-LABEL: @get_desc_op_a
+func.func @get_desc_op_a(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+  %c32 = arith.constant 32 : index
+  %c4096 = arith.constant 4096 : index
+  %c0 = arith.constant 0 : index
+  %0 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16>
+  %1 = xegpu.load_nd %0[%c0, %c0]  : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16>
+  // expected-remark @below {{found desc op}}
+  %3 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16>
+  %2 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %1) -> (vector<256x256xf16>) {
+    %5 = xegpu.load_nd %3[%c0, %arg3] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+    %6 = xegpu.load_nd %4[%arg3, %c0] : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16>
+    %7 = xegpu.dpas %5, %6, %arg4 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16>
+    scf.yield %7 : vector<256x256xf16>
+  }
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_operand %0[0] : (!transform.any_op) -> !transform.any_value
+    %2 = transform.xegpu.get_desc_op %1 : (!transform.any_value) -> !transform.any_op
+    transform.debug.emit_remark_at %2, "found desc op" : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @get_desc_op_c
+func.func @get_desc_op_c(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+  %c32 = arith.constant 32 : index
+  %c4096 = arith.constant 4096 : index
+  %c0 = arith.constant 0 : index
+  // expected-remark @below {{found desc op}}
+  %0 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16>
+  %1 = xegpu.load_nd %0[%c0, %c0]  : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16>
+  %3 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16>
+  %2 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %1) -> (vector<256x256xf16>) {
+    %5 = xegpu.load_nd %3[%c0, %arg3] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+    %6 = xegpu.load_nd %4[%arg3, %c0] : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16>
+    %7 = xegpu.dpas %5, %6, %arg4 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16>
+    scf.yield %7 : vector<256x256xf16>
+  }
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_operand %0[2] : (!transform.any_op) -> !transform.any_value
+    %2 = transform.xegpu.get_desc_op %1 : (!transform.any_value) -> !transform.any_op
+    transform.debug.emit_remark_at %2, "found desc op" : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 // CHECK-LABEL: @set_desc_layout
 func.func @set_desc_layout(%arg0: memref<4096x4096xf16>) {
   // CHECK: %[[V0:.+]] = xegpu.create_nd_tdesc %arg0
diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py
index 1c8a2bcc6a2fb..f83c807f571e1 100644
--- a/mlir/test/python/dialects/transform_xegpu_ext.py
+++ b/mlir/test/python/dialects/transform_xegpu_ext.py
@@ -3,7 +3,7 @@
 from mlir.ir import *
 from mlir.dialects import transform
 from mlir.dialects.transform import xegpu
-from mlir.dialects.transform import structured
+from mlir.dialects.transform import AnyValueType
 
 
 def run(f):
@@ -16,6 +16,21 @@ def run(f):
     return f
 
 
+@run
+def getDescOpDefaultIndex():
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate,
+        [],
+        transform.OperationType.get("xegpu.dpas"),
+    )
+    with InsertionPoint(sequence.body):
+        operand = transform.GetOperandOp(AnyValueType.get(), sequence.bodyTarget, [0])
+        desc_handle = xegpu.GetDescOp(operand)
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: getDescOpDefaultIndex
+    # CHECK: transform.xegpu.get_desc_op %
+
+
 @run
 def setDescLayoutMinimal():
     sequence = transform.SequenceOp(

From 71cdd40eec9a1aa055e1491cf7e45817c9b67c15 Mon Sep 17 00:00:00 2001
From: GrumpyPigSkin <130710602+GrumpyPigSkin@users.noreply.github.com>
Date: Mon, 10 Nov 2025 15:04:35 +0000
Subject: [PATCH 13/28] Allow avx512 bw masked intrinsics to be used in
 constexpr (#162871)

Added CONSTEXPR macro and test for the following intrinsics:

-- _mm_mask_adds_epi16 _mm_maskz_adds_epi16
-- _mm_mask_adds_epi8 _mm_maskz_adds_epi8
-- _mm_mask_adds_epu16 _mm_maskz_adds_epu16
-- _mm_mask_adds_epu8 _mm_maskz_adds_epu8
-- _mm_mask_broadcastb_epi8 _mm_maskz_broadcastb_epi8
-- _mm_mask_broadcastw_epi16 _mm_maskz_broadcastw_epi16
-- _mm_mask_cvtepi8_epi16 _mm_maskz_cvtepi8_epi16
-- _mm_mask_cvtepu8_epi16 _mm_maskz_cvtepu8_epi16
-- _mm_mask_packs_epi16 _mm_maskz_packs_epi16
-- _mm_mask_packs_epi32 _mm_maskz_packs_epi32
-- _mm_mask_packus_epi16 _mm_maskz_packus_epi16
-- _mm_mask_packus_epi32 _mm_maskz_packus_epi32
-- _mm_mask_set1_epi16 _mm_maskz_set1_epi16
-- _mm_mask_set1_epi8 _mm_maskz_set1_epi8
-- _mm_mask_slli_epi16 _mm_mask_slli_epi16
-- _mm_mask_subs_epi16 _mm_maskz_subs_epi16
-- _mm_mask_subs_epi8 _mm_maskz_subs_epi8
-- _mm_mask_subs_epu16 _mm_maskz_subs_epu16
-- _mm_mask_subs_epu8 _mm_maskz_subs_epu8
-- _mm_mask_unpackhi_epi16 _mm_maskz_unpackhi_epi16
-- _mm_mask_unpackhi_epi8 _mm_maskz_unpackhi_epi8
-- _mm_mask_unpacklo_epi16 _mm_maskz_unpacklo_epi16
-- _mm_mask_unpacklo_epi8 _mm_maskz_unpacklo_epi8

-- _mm256_mask_adds_epi16 _mm256_maskz_adds_epi16
-- _mm256_mask_adds_epi8 _mm256_maskz_adds_epi8
-- _mm256_mask_adds_epu16 _mm256_maskz_adds_epu16
-- _mm256_mask_adds_epu8 _mm256_maskz_adds_epu8
-- _mm256_mask_broadcastb_epi8 _mm256_maskz_broadcastb_epi8
-- _mm256_mask_broadcastw_epi16 _mm256_maskz_broadcastw_epi16
-- _mm256_mask_cvtepi8_epi16 _mm256_maskz_cvtepi8_epi16
-- _mm256_mask_cvtepu8_epi16 _mm256_maskz_cvtepu8_epi16
-- _mm256_mask_packs_epi16 _mm256_maskz_packs_epi16
-- _mm256_mask_packs_epi32 _mm256_maskz_packs_epi32
-- _mm256_mask_packus_epi16 _mm256_maskz_packus_epi16
-- _mm256_mask_packus_epi32 _mm256_maskz_packus_epi32
-- _mm256_mask_set1_epi16 _mm256_maskz_set1_epi16
-- _mm256_mask_set1_epi8 _mm256_maskz_set1_epi8
-- _mm256_mask_slli_epi16 _mm256_mask_slli_epi16
-- _mm256_mask_subs_epi16 _mm256_maskz_subs_epi16
-- _mm256_mask_subs_epi8 _mm256_maskz_subs_epi8
-- _mm256_mask_subs_epu16 _mm256_maskz_subs_epu16
-- _mm256_mask_subs_epu8 _mm256_maskz_subs_epu8
-- _mm256_mask_unpackhi_epi16 _mm256_maskz_unpackhi_epi16
-- _mm256_mask_unpackhi_epi8 _mm256_maskz_unpackhi_epi8
-- _mm256_mask_unpacklo_epi16 _mm256_maskz_unpacklo_epi16
-- _mm256_mask_unpacklo_epi8 _mm256_maskz_unpacklo_epi8

-- _mm512_mask_adds_epi16 _mm512_maskz_adds_epi16
-- _mm512_mask_adds_epi8 _mm512_maskz_adds_epi8
-- _mm512_mask_adds_epu16 _mm512_maskz_adds_epu16
-- _mm512_mask_adds_epu8 _mm512_maskz_adds_epu8
-- _mm512_mask_broadcastb_epi8 _mm512_maskz_broadcastb_epi8
-- _mm512_mask_broadcastw_epi16 _mm512_maskz_broadcastw_epi16
-- _mm512_mask_mov_epi16 _mm512_maskz_mov_epi16
-- _mm512_mask_mov_epi8 _mm512_maskz_mov_epi8
-- _mm512_mask_packs_epi16 _mm512_maskz_packs_epi16
-- _mm512_mask_packs_epi32 _mm512_maskz_packs_epi32
-- _mm512_mask_packus_epi16 _mm512_maskz_packus_epi16
-- _mm512_mask_packus_epi32 _mm512_maskz_packus_epi32
-- _mm512_mask_set1_epi16 _mm512_maskz_set1_epi16
-- _mm512_mask_set1_epi8 _mm512_maskz_set1_epi8
-- _mm512_mask_subs_epi16 _mm512_maskz_subs_epi16
-- _mm512_mask_subs_epi8 _mm512_maskz_subs_epi8
-- _mm512_mask_subs_epu16 _mm512_maskz_subs_epu16
-- _mm512_mask_subs_epu8 _mm512_maskz_subs_epu8
-- _mm512_mask_unpackhi_epi16 _mm512_maskz_unpackhi_epi16
-- _mm512_mask_unpackhi_epi8 _mm512_maskz_unpackhi_epi8
-- _mm512_mask_unpacklo_epi16 _mm512_maskz_unpacklo_epi16
-- _mm512_mask_unpacklo_epi8 _mm512_maskz_unpacklo_epi8

closes #162070
---
 clang/lib/Headers/avx512bwintrin.h           | 118 ++++++------
 clang/lib/Headers/avx512vlbwintrin.h         | 182 +++++++++----------
 clang/test/CodeGen/X86/avx512bw-builtins.c   |  80 +++++++-
 clang/test/CodeGen/X86/avx512vlbw-builtins.c | 169 +++++++++++++----
 4 files changed, 356 insertions(+), 193 deletions(-)

diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 203b600078842..4a02c96620335 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -511,7 +511,7 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
@@ -519,9 +519,8 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
                                        (__v32hi)_mm512_packs_epi32(__A, __B),
                                        (__v32hi)__W);
@@ -532,7 +531,7 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
@@ -540,7 +539,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
@@ -553,7 +552,7 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
@@ -561,7 +560,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
                                        (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
@@ -574,7 +573,7 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
@@ -582,7 +581,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
@@ -595,17 +594,15 @@ _mm512_adds_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_adds_epi8(__A, __B),
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_adds_epi8(__A, __B),
                                         (__v64qi)_mm512_setzero_si512());
@@ -616,7 +613,7 @@ _mm512_adds_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -624,7 +621,7 @@ _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                         (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -637,7 +634,7 @@ _mm512_adds_epu8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -645,7 +642,7 @@ _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -658,17 +655,15 @@ _mm512_adds_epu16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_adds_epu16(__A, __B),
                                         (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                         (__v32hi)_mm512_adds_epu16(__A, __B),
                                         (__v32hi)_mm512_setzero_si512());
@@ -886,7 +881,7 @@ _mm512_subs_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -894,7 +889,7 @@ _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -907,7 +902,7 @@ _mm512_subs_epi16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -915,7 +910,7 @@ _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                         (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -928,7 +923,7 @@ _mm512_subs_epu8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -936,7 +931,7 @@ _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
@@ -949,7 +944,7 @@ _mm512_subs_epu16(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -957,7 +952,7 @@ _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
                                         (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
 {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
@@ -1191,14 +1186,14 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
                                           62, 64+62, 63, 64+63);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpackhi_epi8(__A, __B),
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpackhi_epi8(__A, __B),
@@ -1218,14 +1213,14 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
                                           30, 32+30, 31, 32+31);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpackhi_epi16(__A, __B),
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpackhi_epi16(__A, __B),
@@ -1253,14 +1248,14 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
                                           54, 64+54, 55, 64+55);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpacklo_epi8(__A, __B),
                                         (__v64qi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
                                         (__v64qi)_mm512_unpacklo_epi8(__A, __B),
@@ -1280,14 +1275,14 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
                                           26, 32+26, 27, 32+27);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpacklo_epi16(__A, __B),
                                        (__v32hi)__W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
                                        (__v32hi)_mm512_unpacklo_epi16(__A, __B),
@@ -1566,7 +1561,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) {
   ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a),         \
                                                 (int)(imm)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
@@ -1574,23 +1569,21 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
                 (__v32hi) __W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
   return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
                 (__v32hi) __A,
                 (__v32hi) _mm512_setzero_si512 ());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
   return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
                 (__v64qi) __A,
                 (__v64qi) __W);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
 {
   return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
@@ -1598,7 +1591,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
                 (__v64qi) _mm512_setzero_si512 ());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
 {
   return (__m512i) __builtin_ia32_selectb_512(__M,
@@ -1606,9 +1599,8 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
                                               (__v64qi) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
   return (__m512i) __builtin_ia32_selectb_512(__M,
                                               (__v64qi) _mm512_set1_epi8(__A),
                                               (__v64qi) _mm512_setzero_si512());
@@ -1801,7 +1793,7 @@ _mm512_broadcastb_epi8(__m128i __A) {
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectb_512(__M,
@@ -1809,15 +1801,14 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
                                              (__v64qi) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
   return (__m512i)__builtin_ia32_selectb_512(__M,
                                              (__v64qi) _mm512_broadcastb_epi8(__A),
                                              (__v64qi) _mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
 {
   return (__m512i) __builtin_ia32_selectw_512(__M,
@@ -1825,9 +1816,8 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
                                               (__v32hi) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
   return (__m512i) __builtin_ia32_selectw_512(__M,
                                               (__v32hi) _mm512_set1_epi16(__A),
                                               (__v32hi) _mm512_setzero_si512());
@@ -1840,7 +1830,7 @@ _mm512_broadcastw_epi16(__m128i __A) {
                                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectw_512(__M,
@@ -1848,7 +1838,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
                                              (__v32hi) __O);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
 {
   return (__m512i)__builtin_ia32_selectw_512(__M,
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 575c0c8962662..d23188ab02b6c 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -536,14 +536,14 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) {
                                              (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
                                              (__v8hi)_mm_packs_epi32(__A, __B),
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
@@ -551,7 +551,7 @@ _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
@@ -559,7 +559,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
@@ -567,7 +567,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                           (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
@@ -575,7 +575,7 @@ _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                              (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
@@ -583,7 +583,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                              (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
@@ -591,7 +591,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                           (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
@@ -599,7 +599,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                           (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
@@ -607,7 +607,7 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
@@ -615,7 +615,7 @@ _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
@@ -623,7 +623,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
@@ -631,7 +631,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
                                          (__v16hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
@@ -639,7 +639,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
                                             (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
@@ -647,7 +647,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
                                             (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
@@ -655,7 +655,7 @@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
                                          (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
@@ -663,7 +663,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
                                          (__v32qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -671,7 +671,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -679,7 +679,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -687,7 +687,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -695,7 +695,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -703,7 +703,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -711,7 +711,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -719,7 +719,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -727,7 +727,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -735,7 +735,7 @@ _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -743,7 +743,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -751,7 +751,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -759,7 +759,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -767,7 +767,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -775,7 +775,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -783,7 +783,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1095,7 +1095,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
                                          (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -1103,7 +1103,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -1111,7 +1111,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -1119,7 +1119,7 @@ _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -1127,7 +1127,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1135,7 +1135,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1143,7 +1143,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1151,7 +1151,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1159,7 +1159,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
                                            (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -1167,7 +1167,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
@@ -1175,7 +1175,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
                                              (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -1183,7 +1183,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
@@ -1191,7 +1191,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
                                             (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1199,7 +1199,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1207,7 +1207,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
       __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1215,7 +1215,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
                                            (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1432,14 +1432,14 @@ _mm_cvtepi16_epi8(__m128i __A) {
       12, 13, 14, 15);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                (__v16qi) __O,
                __M);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
   return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
                (__v16qi) _mm_setzero_si128(),
@@ -1588,112 +1588,112 @@ _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
       (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpackhi_epi8(__A, __B),
                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                         (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpackhi_epi8(__A, __B),
                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                            (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpackhi_epi16(__A, __B),
                                            (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                        (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpackhi_epi16(__A, __B),
                                        (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                            (__v16qi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
                                            (__v16qi)_mm_unpacklo_epi8(__A, __B),
                                            (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                         (__v32qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
                                         (__v32qi)_mm256_unpacklo_epi8(__A, __B),
                                         (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                            (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
                                            (__v8hi)_mm_unpacklo_epi16(__A, __B),
                                            (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                        (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
                                        (__v16hi)_mm256_unpacklo_epi16(__A, __B),
                                        (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1701,7 +1701,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1709,7 +1709,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1717,7 +1717,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                              (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1726,7 +1726,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
 }
 
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1734,7 +1734,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1742,7 +1742,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
                                              (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1750,7 +1750,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
                                              (__v16hi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1877,7 +1877,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
                                           (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -1885,7 +1885,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
                                              (__v8hi)__W);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
 {
   return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
@@ -2173,7 +2173,7 @@ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
                 (__v32qi) _mm256_setzero_si256 ());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
 {
   return (__m128i) __builtin_ia32_selectb_128(__M,
@@ -2181,7 +2181,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
                                               (__v16qi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
 {
  return (__m128i) __builtin_ia32_selectb_128(__M,
@@ -2189,7 +2189,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A)
                                              (__v16qi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
 {
   return (__m256i) __builtin_ia32_selectb_256(__M,
@@ -2197,7 +2197,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
                                               (__v32qi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
 {
   return (__m256i) __builtin_ia32_selectb_256(__M,
@@ -2528,7 +2528,7 @@ _mm256_movm_epi16 (__mmask16 __A)
   return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectb_128(__M,
@@ -2536,7 +2536,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
                                              (__v16qi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectb_128(__M,
@@ -2544,7 +2544,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
                                              (__v16qi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectb_256(__M,
@@ -2552,7 +2552,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
                                              (__v32qi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectb_256(__M,
@@ -2560,7 +2560,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
                                              (__v32qi) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128(__M,
@@ -2568,7 +2568,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
                                              (__v8hi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
 {
   return (__m128i)__builtin_ia32_selectw_128(__M,
@@ -2576,7 +2576,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
                                              (__v8hi) _mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256(__M,
@@ -2584,7 +2584,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
                                              (__v16hi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
 {
   return (__m256i)__builtin_ia32_selectw_256(__M,
@@ -2592,7 +2592,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
                                              (__v16hi) _mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
 {
   return (__m256i) __builtin_ia32_selectw_256 (__M,
@@ -2600,7 +2600,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
                                                (__v16hi) __O);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
 {
   return (__m256i) __builtin_ia32_selectw_256(__M,
@@ -2608,7 +2608,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
                                               (__v16hi) _mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
 {
   return (__m128i) __builtin_ia32_selectw_128(__M,
@@ -2616,7 +2616,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
                                               (__v8hi) __O);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_set1_epi16 (__mmask8 __M, short __A)
 {
   return (__m128i) __builtin_ia32_selectw_128(__M,
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index 834e140018c34..0b73c7b14d869 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1205,12 +1205,16 @@ __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_packs_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packs_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-42,0,32767,0,-32768,0,1,0,30000,0,-32768,0,-32768,0,-32768,0,-32768,0,-22222,0,-32768));
+
 __m512i test_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_packs_epi32
   // CHECK: @llvm.x86.avx512.packssdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_packs_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_packs_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,9,-32768,11,-42,13,32767,15,-32768,17,1,19,30000,21,-32768,23,-32768,25,-32768,27,-32768,29,-22222,31,-32768));
+
 __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_packs_epi16
   // CHECK: @llvm.x86.avx512.packsswb.512
@@ -1223,48 +1227,62 @@ __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_packs_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_packs_epi16((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,17,-128,19,-128,21,-128,23,-90,25,-5,27,-100,29,-128,31,-128,33,-128,35,-1,37,-127,39,-128,41,2,43,127,45,127,47,42,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
+
 __m512i test_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_packs_epi16
   // CHECK: @llvm.x86.avx512.packsswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_packs_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packs_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-128,0,-128,0,-90,0,-5,0,-100,0,-128,0,-128,0,-128,0,-1,0,-127,0,-128,0,2,0,127,0,127,0,42,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
+
 __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   return _mm512_packus_epi32(__A,__B); 
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0));
+
 __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_packus_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packus_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,-1,0,0,0,1,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
+
 __m512i test_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_packus_epi32
   // CHECK: @llvm.x86.avx512.packusdw.512
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_packus_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_packus_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,9,0,11,0,13,-1,15,0,17,1,19,-1,21,0,23,0,25,0,27,0,29,0,31,0));
+
 __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   return _mm512_packus_epi16(__A,__B); 
 }
 TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0));
+
 __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_packus_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_packus_epi16((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,17,42,19,-1,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,127,37,-1,39,0,41,1,43,-1,45,-128,47,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
+
 __m512i test_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_packus_epi16
   // CHECK: @llvm.x86.avx512.packuswb.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_packus_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packus_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
+
 __m512i test_mm512_adds_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_adds_epi8
   // CHECK: @llvm.sadd.sat.v64i8
@@ -1278,18 +1296,22 @@ __m512i test_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
  return _mm512_mask_adds_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_adds_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,+58,31,+62,33,+66,35,+70,37,+74,39,+78,41,+82,43,+86,45,+90,47,+94,49,+127,51,+127,53,-80,+55,-30,57,+30,59,+90,61,+20,63,+10));
+
 __m512i test_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_adds_epi8
   // CHECK: @llvm.sadd.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_adds_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_adds_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,+58,0,+62,0,+66,0,+70,0,+74,0,+78,0,+82,0,+86,0,+90,0,+94,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
+
 __m512i test_mm512_adds_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_adds_epi16
   // CHECK: @llvm.sadd.sat.v32i16
  return _mm512_adds_epi16(__A,__B); 
 }
-TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +800, -800, -800, +800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200));
+TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,-32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,-4,+6,-8,+10,-12,+14,-16,+18,-20,+22,-24,+26,-28,+30,-32,+34,-36,+38,-40,+42,-44,+46,-48,+50,-52,+54,+32767,-32768,+31200,-31200));
 
 __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_adds_epi16
@@ -1297,12 +1319,16 @@ __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_adds_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_adds_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,-32768,31,+32767));
+
 __m512i test_mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_adds_epi16
   // CHECK: @llvm.sadd.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_adds_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_adds_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,-32768,0,+32767));
+
 __m512i test_mm512_adds_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_adds_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512
@@ -1318,7 +1344,7 @@ __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_adds_epu8(__W,__U,__A,__B); 
 }
-TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
+TEST_CONSTEXPR(match_v64qu(_mm512_mask_adds_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,17,+127,19,+191,21,+255,23,+255,25,+190,27,+254,29,+255,31,+255,33,+191,35,+255,37,+255,39,+255,41,+254,43,+255,45,+255,47,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
 
 __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_adds_epu8
@@ -1327,12 +1353,16 @@ __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_adds_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qu(_mm512_maskz_adds_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+127,0,+191,0,+255,0,+255,0,+190,0,+254,0,+255,0,+255,0,+191,0,+255,0,+255,0,+255,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
+
 __m512i test_mm512_adds_epu16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
   // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_adds_epu16(__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535));
+
 __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
@@ -1340,6 +1370,8 @@ __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m5
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_adds_epu16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_mask_adds_epu16((__m512i)(__v32hu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,9,+65535,11,+49151,13,+65535,15,+65535,17,+49152,19,+65535,21,+65535,23,+65535,25,+65535,27,+65535,29,+65535,31,+65535));
+
 __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_adds_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512
@@ -1347,6 +1379,8 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_adds_epu16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_maskz_adds_epu16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+49151,0,+65535,0,+65535,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535));
+
 __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_avg_epu8
   // CHECK: @llvm.x86.avx512.pavg.b.512
@@ -1640,12 +1674,16 @@ __m512i test_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_mask_subs_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_subs_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,17,-128,19,0,21,-44,23,127,25,-128,27,0,29,-60,31,127,33,-128,35,0,37,-76,39,127,41,-128,43,0,45,-92,47,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
+
 __m512i test_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_subs_epi8
   // CHECK: @llvm.ssub.sat.v64i8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_maskz_subs_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_subs_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-44,0,127,0,-128,0,0,0,-60,0,127,0,-128,0,0,0,-76,0,127,0,-128,0,0,0,-92,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
+
 __m512i test_mm512_subs_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_subs_epi16
   // CHECK: @llvm.ssub.sat.v32i16
@@ -1658,18 +1696,24 @@ __m512i test_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_mask_subs_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_subs_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,9,-32768,11,32767,13,-28,15,32,17,-32768,19,32767,21,-44,23,48,25,-32768,27,32767,29,-60,31,64));
+
 __m512i test_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_subs_epi16
   // CHECK: @llvm.ssub.sat.v32i16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_subs_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_subs_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-28,0,32,0,-32768,0,32767,0,-44,0,48,0,-32768,0,32767,0,-60,0,64));
+
 __m512i test_mm512_subs_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_subs_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
   // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}})
 return _mm512_subs_epu8(__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,0,0,0,0,0,0,0,+63,0,0,0,0,0,0,0,+64,+1,0,0,0,0,0,0,+127,+64,+63,0,0,0,0,0,+128,+65,+64,+1,0,0,0,0,+191,+128,+127,+64,+63,0,0,0,+192,+129,+128,+65,+64,+1,0,0,+255,+192,+191,+128,+127,+64,+63,+0));
+
 __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_subs_epu8
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512
@@ -1677,7 +1721,7 @@ __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_mask_subs_epu8(__W,__U,__A,__B); 
 }
-TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, 0, 0, 0, 0, 0, 0, 0, +63, 0, 0, 0, 0, 0, 0, 0, +64, +1, 0, 0, 0, 0, 0, 0, +127, +64, +63, 0, 0, 0, 0, 0, +128, +65, +64, +1, 0, 0, 0, 0, +191, +128, +127, +64, +63, 0, 0, 0, +192, +129, +128, +65, +64, +1, 0, 0, +255, +192, +191, +128, +127, +64, +63, +0));
+TEST_CONSTEXPR(match_v64qu(_mm512_mask_subs_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,17,200,19,0,21,0,23,254,25,0,27,1,29,1,31,0,33,200,35,0,37,0,39,254,41,0,43,1,45,1,47,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
 
 __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_subs_epu8
@@ -1686,20 +1730,25 @@ __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
 return _mm512_maskz_subs_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v64qu(_mm512_maskz_subs_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
+
 __m512i test_mm512_subs_epu16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
   // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 return _mm512_subs_epu16(__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
+
 __m512i test_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
   // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_mask_subs_epu16(__W,__U,__A,__B); 
-TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0));
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_mask_subs_epu16((__m512i)(__v32hu){101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000,0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000,0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,109,0,111,0,113,1,115,25000,117,60000,119,0,121,0,123,65534,125,0,127,0,129,1,131,25000));
+
 __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_subs_epu16
   // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512
@@ -1707,6 +1756,8 @@ __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
 return _mm512_maskz_subs_epu16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32hu(_mm512_maskz_subs_epu16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){51,65000,0,40000,0,100,0,65535,42,0,0,1000,0,1,0,50000,69,65000,0,40000,0,100,0,65535,71,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){2652,5000,0,40000,0,200,0,1,398,1,0,65535,0,0,0,25000,29625,5000,0,40000,0,200,0,1,25274,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000,0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
+
 __m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask2_permutex2var_epi16
   // CHECK: @llvm.x86.avx512.vpermi2var.hi.512
@@ -2041,6 +2092,7 @@ __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpackhi_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,17,-25,19,-26,21,-27,23,-28,25,-29,27,-30,29,-31,31,-32,33,-41,35,-42,37,-43,39,-44,41,-45,43,-46,45,-47,47,-48,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
 
 __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_unpackhi_epi8
@@ -2048,6 +2100,7 @@ __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B)
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpackhi_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-25,0,-26,0,-27,0,-28,0,-29,0,-30,0,-31,0,-32,0,-41,0,-42,0,-43,0,-44,0,-45,0,-46,0,-47,0,-48,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
 
 __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_unpackhi_epi16
@@ -2063,6 +2116,7 @@ __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpackhi_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,9,214,11,215,13,216,15,217,17,224,19,225,21,226,23,227,25,234,27,235,136,236,137,237));
 
 __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_unpackhi_epi16
@@ -2070,6 +2124,7 @@ __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpackhi_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,214,0,215,0,216,0,217,0,224,0,225,0,226,0,227,0,234,0,235,136,236,137,237));
 
 __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_unpacklo_epi8
@@ -2084,6 +2139,7 @@ __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, _
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpacklo_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,17,20,19,21,21,22,23,23,25,24,27,25,29,26,31,27,33,40,35,41,37,42,39,43,41,44,43,45,45,46,47,47,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
 
 __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_unpacklo_epi8
@@ -2091,6 +2147,7 @@ __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B)
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpacklo_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
 
 __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_unpacklo_epi16
@@ -2105,6 +2162,7 @@ __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A,
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpacklo_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,9,210,11,211,13,212,15,213,17,220,19,221,21,222,23,223,25,230,27,231,132,232,133,233));
 
 __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_unpacklo_epi16
@@ -2112,6 +2170,7 @@ __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpacklo_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,210,0,211,0,212,0,213,0,220,0,221,0,222,0,223,0,230,0,231,132,232,133,233));
 
 __m512i test_mm512_cvtepi8_epi16(__m256i __A) {
   // CHECK-LABEL: test_mm512_cvtepi8_epi16
@@ -2499,24 +2558,28 @@ __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_mov_epi16(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_mov_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31));
 
 __m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_maskz_mov_epi16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_mov_epi16(__U, __A); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mov_epi16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31));
 
 __m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_mov_epi8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_mov_epi8(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_mov_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31,32,-33,34,-35,36,-37,38,-39,40,-41,42,-43,44,-45,46,-47,48,-49,50,-51,52,-53,54,-55,56,-57,58,-59,60,-61,62,-63));
 
 __m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_maskz_mov_epi8
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_mov_epi8(__U, __A); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_mov_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31,0,-33,0,-35,0,-37,0,-39,0,-41,0,-43,0,-45,0,-47,0,-49,0,-51,0,-53,0,-55,0,-57,0,-59,0,-61,0,-63));
 
 __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
   // CHECK-LABEL: test_mm512_mask_set1_epi8
@@ -2585,6 +2648,7 @@ __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_set1_epi8(__O, __M, __A); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_set1_epi8((__m512i)(__v64qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,17,42,19,42,21,42,23,42,25,42,27,42,29,42,31,42,33,42,35,42,37,42,39,42,41,42,43,42,45,42,47,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
 
 __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
   // CHECK-LABEL: test_mm512_maskz_set1_epi8
@@ -2655,6 +2719,7 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_set1_epi8(__M, __A); 
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_set1_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
 
 __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
   // CHECK-LABEL: test_mm512_kunpackd
@@ -2830,6 +2895,7 @@ __m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A)
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_broadcastb_epi8(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_mask_broadcastb_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120,32,-120,34,-120,36,-120,38,-120,40,-120,42,-120,44,-120,46,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
 
 __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_maskz_broadcastb_epi8
@@ -2837,6 +2903,7 @@ __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_broadcastb_epi8(__M, __A);
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_maskz_broadcastb_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
 
 __m512i test_mm512_broadcastw_epi16(__m128i __A) {
   // CHECK-LABEL: test_mm512_broadcastw_epi16
@@ -2878,6 +2945,7 @@ __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_broadcastw_epi16(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_broadcastw_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120));
 
 __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_maskz_broadcastw_epi16
@@ -2885,6 +2953,7 @@ __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_broadcastw_epi16(__M, __A);
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_broadcastw_epi16((__mmask32)0xAAAAAAAAu,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
 
 __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
   // CHECK-LABEL: test_mm512_mask_set1_epi16
@@ -2923,6 +2992,7 @@ __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_set1_epi16(__O, __M, __A); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_set1_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,-1),1,-1,3,-1,5,-1,7,-1,9,-1,11,-1,13,-1,15,-1,17,-1,19,-1,21,-1,23,-1,25,-1,27,-1,29,-1,31,-1));
 
 __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
   // CHECK-LABEL: test_mm512_maskz_set1_epi16
@@ -2961,6 +3031,8 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_set1_epi16(__M, __A); 
 }
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_set1_epi16((__mmask32)0xAAAAAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
+
 __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_permutexvar_epi16
   // CHECK: @llvm.x86.avx512.permvar.hi.512
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 172a3cb219c8a..28e6afbc24564 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -941,56 +941,28 @@ __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_blend_epi8(__U,__A,__W); 
 }
-TEST_CONSTEXPR(match_v16qi(
-  _mm_mask_blend_epi8(
-    (__mmask16)0x0001,
-    (__m128i)(__v16qi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
-    (__m128i)(__v16qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
-  ),
-  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-));
+TEST_CONSTEXPR(match_v16qi(_mm_mask_blend_epi8((__mmask16)0x0001,(__m128i)(__v16qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m128i)(__v16qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
 
 __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: test_mm256_mask_blend_epi8
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_blend_epi8(__U,__A,__W); 
 }
-TEST_CONSTEXPR(match_v32qi(
-  _mm256_mask_blend_epi8(
-    (__mmask32) 0x00000001,
-    (__m256i)(__v32qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
-    (__m256i)(__v32qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}
-  ),
-  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-));
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_blend_epi8((__mmask32)0x00000001,(__m256i)(__v32qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v32qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
 
 __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: test_mm_mask_blend_epi16
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_blend_epi16(__U,__A,__W); 
 }
-TEST_CONSTEXPR(match_v8hi(
-  _mm_mask_blend_epi16(
-    (__mmask8)0x01,
-    (__m128i)(__v8hi){2, 2, 2, 2, 2, 2, 2, 2},
-    (__m128i)(__v8hi){ 10,11,12,13,14,15,16,17 }
-  ),
-  10, 2, 2, 2, 2, 2, 2, 2
-));
+TEST_CONSTEXPR(match_v8hi(_mm_mask_blend_epi16((__mmask8)0x01,(__m128i)(__v8hi){2,2,2,2,2,2,2,2},(__m128i)(__v8hi){10,11,12,13,14,15,16,17}),10,2,2,2,2,2,2,2));
 
 __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: test_mm256_mask_blend_epi16
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_blend_epi16(__U,__A,__W); 
 }
-TEST_CONSTEXPR(match_v16hi(
-  _mm256_mask_blend_epi16(
-    (__mmask16)0x0001,
-    (__m256i)(__v16hi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2},
-    (__m256i)(__v16hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 }
-  ),
-  10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
-));
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_blend_epi16((__mmask16)0x0001,(__m256i)(__v16hi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v16hi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2));
 
 __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_abs_epi8
@@ -1078,48 +1050,63 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_packs_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_packs_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),0,-32768,0,-32768,0,32767,0,-32768));
+
 __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_packs_epi32
   // CHECK: @llvm.x86.sse2.packssdw
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_packs_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_packs_epi32((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),1,-32768,3,-32768,29,32767,31,-32768));
+
 __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_packs_epi32
   // CHECK: @llvm.x86.avx2.packssdw
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_packs_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packs_epi32((__mmask32)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-32768,0,-22222,0,-32768));
+
 __m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_packs_epi32
   // CHECK: @llvm.x86.avx2.packssdw
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_packs_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_packs_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,25,-32768,27,-32768,29,-22222,31,-32768));
+
 __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_packs_epi16
   // CHECK: @llvm.x86.sse2.packsswb
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_packs_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_packs_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-128,0,1,0,127,0,90,0,-128));
+
 __m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_packs_epi16
   // CHECK: @llvm.x86.sse2.packsswb
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_packs_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_packs_epi16((__m128i)(__v16qi){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-128,57,1,59,127,61,90,63,-128));
+
 __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_packs_epi16
   // CHECK: @llvm.x86.avx2.packsswb
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_packs_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packs_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128));
+
 __m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_packs_epi16
   // CHECK: @llvm.x86.avx2.packsswb
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_packs_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_packs_epi16((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128));
 
 __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_packus_epi32
@@ -1127,6 +1114,7 @@ __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_packus_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_mask_packus_epi32((__m128i)(__v8hu){1,2,3,4,5,6,7,8},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),1,0,3,0,5,1,7,65535));
 
 __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_packus_epi32
@@ -1134,6 +1122,7 @@ __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_packus_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_packus_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),0,0,0,0,0,1,0,65535));
 
 __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_packus_epi32
@@ -1141,6 +1130,7 @@ __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_packus_epi32(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packus_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,0,0,0));
 
 __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_packus_epi32
@@ -1148,6 +1138,7 @@ __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_packus_epi32(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_packus_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,25,0,27,0,29,0,31,0));
 
 __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_packus_epi16
@@ -1155,6 +1146,7 @@ __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_packus_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_maskz_packus_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),0,0,0,127,0,255,0,0,0,1,0,255,0,128,0,0));
 
 __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_packus_epi16
@@ -1162,6 +1154,7 @@ __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m12
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_packus_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_mask_packus_epi16((__m128i)(__v16qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),1,0,3,127,5,255,7,0,9,1,11,255,13,128,15,0));
 
 __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_packus_epi16
@@ -1169,6 +1162,7 @@ __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_packus_epi16(__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packus_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0));
 
 __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_packus_epi16
@@ -1176,6 +1170,7 @@ __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_packus_epi16(__W,__M,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_packus_epi16((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0));
 
 __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_adds_epi8
@@ -1183,48 +1178,64 @@ __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_adds_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_adds_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),1,+2,3,+6,5,+10,7,+14,57,+30,59,+90,61,+127,63,-128));
+
 __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_adds_epi8
   // CHECK: @llvm.sadd.sat.v16i8
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_adds_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_adds_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),0,+2,0,+6,0,+10,0,+14,0,+30,0,+90,0,+127,0,-128));
+
 __m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_adds_epi8
   // CHECK: @llvm.sadd.sat.v32i8
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_adds_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_adds_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,49,+127,51,+127,53,-80,+55,-30,57,+30,59,+90,61,+20,63,+10));
+
 __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_adds_epi8
   // CHECK: @llvm.sadd.sat.v32i8
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_adds_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_adds_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10));
+
 __m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_adds_epi16
   // CHECK: @llvm.sadd.sat.v8i16
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_adds_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_adds_epi16((__m128i)(__v8hi){9,10,11,12,13,14,15,16,},(__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),9,+50,11,+54,13,-32768,15,+32767));
+
 __m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_adds_epi16
   // CHECK: @llvm.sadd.sat.v8i16
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_adds_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_adds_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),0,+50,0,+54,0,-32768,0,+32767));
+
 __m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_adds_epi16
   // CHECK: @llvm.sadd.sat.v16i16
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_adds_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_adds_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,},(__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+50,11,+54,13,-32768,15,+32767));
+
 __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_adds_epi16
   // CHECK: @llvm.sadd.sat.v16i16
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_adds_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_adds_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+50,0,+54,0,-32768,0,+32767));
+
 __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_adds_epu8
   // CHECK-NOT: @llvm.x86.sse2.paddus.b
@@ -1232,6 +1243,8 @@ __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_adds_epu8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_mask_adds_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,57,+255,59,+255,61,+255,63,+255));
+
 __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_adds_epu8
   // CHECK-NOT: @llvm.x86.sse2.paddus.b
@@ -1239,6 +1252,8 @@ __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_adds_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_maskz_adds_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+255,0,+255,0,+255,0,+255));
+
 __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_adds_epu8
   // CHECK-NOT: @llvm.x86.avx2.paddus.b
@@ -1246,6 +1261,8 @@ __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_adds_epu8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qu(_mm256_mask_adds_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255));
+
 __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_adds_epu8
   // CHECK-NOT: @llvm.x86.avx2.paddus.b
@@ -1253,6 +1270,8 @@ __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_adds_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qu(_mm256_maskz_adds_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255));
+
 __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_adds_epu16
   // CHECK-NOT: @llvm.x86.sse2.paddus.w
@@ -1260,6 +1279,8 @@ __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_adds_epu16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_mask_adds_epu16((__m128i)(__v8hu){25,26,27,28,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),25,+32768,27,+49152,29,+65535,31,+65535));
+
 __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_adds_epu16
   // CHECK-NOT: @llvm.x86.sse2.paddus.w
@@ -1267,6 +1288,8 @@ __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_adds_epu16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_adds_epu16((__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),0,+32768,0,+49152,0,+65535,0,+65535));
+
 __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_adds_epu16
   // CHECK-NOT: @llvm.x86.avx2.paddus.w
@@ -1274,6 +1297,8 @@ __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_adds_epu16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hu(_mm256_mask_adds_epu16((__m256i)(__v16hu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,25,+65535,27,+65535,29,+65535,31,+65535));
+
 __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_adds_epu16
   // CHECK-NOT: @llvm.x86.avx2.paddus.w
@@ -1281,6 +1306,8 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_adds_epu16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hu(_mm256_maskz_adds_epu16((__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535));
+
 __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_avg_epu8
   // CHECK: @llvm.x86.sse2.pavg.b
@@ -1740,48 +1767,64 @@ __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_subs_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_subs_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,57,-128,59,0,61,-124,63,127));
+
 __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_subs_epi8
   // CHECK: @llvm.ssub.sat.v16i8
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_subs_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_subs_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-124,0,127));
+
 __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_subs_epi8
   // CHECK: @llvm.ssub.sat.v32i8
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_subs_epi8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_subs_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127));
+
 __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_subs_epi8
   // CHECK: @llvm.ssub.sat.v32i8
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_subs_epi8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_subs_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127));
+
 __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_subs_epi16
   // CHECK: @llvm.ssub.sat.v8i16
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_subs_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_subs_epi16((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),1,-32768,3,32767,29,-60,31,64));
+
 __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_subs_epi16
   // CHECK: @llvm.ssub.sat.v8i16
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_subs_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_subs_epi16((__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),0,-32768,0,32767,0,-60,0,64));
+
 __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_subs_epi16
   // CHECK: @llvm.ssub.sat.v16i16
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_subs_epi16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_subs_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,25,-32768,27,32767,29,-60,31,64));
+
 __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_subs_epi16
   // CHECK: @llvm.ssub.sat.v16i16
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_subs_epi16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_subs_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-60,0,64));
+
 __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_subs_epu8
   // CHECK-NOT: @llvm.x86.sse2.psubus.b
@@ -1789,6 +1832,8 @@ __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_subs_epu8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_mask_subs_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,57,0,59,1,61,1,63,0));
+
 __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_subs_epu8
   // CHECK-NOT: @llvm.x86.sse2.psubus.b
@@ -1796,6 +1841,8 @@ __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_subs_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16qu(_mm_maskz_subs_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
+
 __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_subs_epu8
   // CHECK-NOT: @llvm.x86.avx2.psubus.b
@@ -1803,6 +1850,8 @@ __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_subs_epu8(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qu(_mm256_mask_subs_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0));
+
 __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_subs_epu8
   // CHECK-NOT: @llvm.x86.avx2.psubus.b
@@ -1810,6 +1859,8 @@ __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_subs_epu8(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v32qu(_mm256_maskz_subs_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0));
+
 __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_subs_epu16
   // CHECK-NOT: @llvm.x86.sse2.psubus.w
@@ -1817,6 +1868,8 @@ __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_subs_epu16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_mask_subs_epu16((__m128i)(__v8hu){101,102,103,104,129,130,131,132},(__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),101,60000,103,0,129,1,131,25000));
+
 __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_subs_epu16
   // CHECK-NOT: @llvm.x86.sse2.psubus.w
@@ -1824,6 +1877,8 @@ __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_subs_epu16(__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v8hu(_mm_maskz_subs_epu16((__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),0,60000,0,0,0,1,0,25000));
+
 __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_subs_epu16
   // CHECK-NOT: @llvm.x86.avx2.psubus.w
@@ -1831,6 +1886,8 @@ __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_subs_epu16(__W,__U,__A,__B); 
 }
+TEST_CONSTEXPR(match_v16hu(_mm256_mask_subs_epu16((__m256i)(__v16hu){101,102,103,104,105,106,107,108,125,126,127,128,129,130,131,132},(__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,125,0,127,0,129,1,131,25000));
+
 __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_subs_epu16
   // CHECK-NOT: @llvm.x86.avx2.psubus.w
@@ -1838,7 +1895,7 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_subs_epu16(__U,__A,__B); 
 }
-
+TEST_CONSTEXPR(match_v16hu(_mm256_maskz_subs_epu16((__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,10,65535,0,0,0,1000,0,1,10000,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000));
 
 __m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) {
   // CHECK-LABEL: test_mm_mask2_permutex2var_epi16
@@ -2233,6 +2290,7 @@ __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_unpackhi_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,114,-15,115,-16));
 
 __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_unpackhi_epi8
@@ -2240,6 +2298,7 @@ __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpackhi_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpackhi_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,114,-15,115,-16));
 
 __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_unpackhi_epi8
@@ -2247,6 +2306,7 @@ __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpackhi_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64));
 
 __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_unpackhi_epi8
@@ -2254,6 +2314,7 @@ __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpackhi_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpackhi_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64));
 
 __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_unpackhi_epi16
@@ -2261,6 +2322,7 @@ __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_unpackhi_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,204,3,205,106,206,107,207));
 
 __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_unpackhi_epi16
@@ -2268,6 +2330,7 @@ __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpackhi_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpackhi_epi16((__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,204,0,205,106,206,107,207));
 
 __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_unpackhi_epi16
@@ -2275,6 +2338,7 @@ __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpackhi_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,25,234,27,235,136,236,137,237));
 
 __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_unpackhi_epi16
@@ -2282,6 +2346,7 @@ __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B)
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpackhi_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpackhi_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,234,0,235,136,236,137,237));
 
 __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_unpacklo_epi8
@@ -2289,6 +2354,7 @@ __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_unpacklo_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,106,-7,107,-8));
 
 __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_unpacklo_epi8
@@ -2296,6 +2362,7 @@ __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpacklo_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpacklo_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,106,-7,107,-8));
 
 __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_unpacklo_epi8
@@ -2303,6 +2370,7 @@ __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, _
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpacklo_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67));
 
 __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_unpacklo_epi8
@@ -2310,6 +2378,7 @@ __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpacklo_epi8(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpacklo_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67));
 
 __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_unpacklo_epi16
@@ -2317,6 +2386,7 @@ __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_unpacklo_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,200,3,201,102,202,103,203));
 
 __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_unpacklo_epi16
@@ -2324,6 +2394,7 @@ __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpacklo_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpacklo_epi16((__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,200,0,201,102,202,103,203));
 
 __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_unpacklo_epi16
@@ -2331,6 +2402,7 @@ __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A,
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpacklo_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,25,230,27,231,132,232,133,233));
 
 __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_unpacklo_epi16
@@ -2338,6 +2410,7 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B)
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpacklo_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpacklo_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,230,0,231,132,232,133,233));
 
 __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_cvtepi8_epi16
@@ -2345,6 +2418,7 @@ __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_cvtepi8_epi16(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepi8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,-777,3,-777,-777,-6,-777,-8));
 
 __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_cvtepi8_epi16
@@ -2352,6 +2426,7 @@ __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_cvtepi8_epi16(__U, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepi8_epi16((__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,0,3,0,0,-6,0,-8));
 
 __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
   // CHECK-LABEL: test_mm256_mask_cvtepi8_epi16
@@ -2359,6 +2434,7 @@ __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_cvtepi8_epi16(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepi8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,-777,3,-777,-777,-6,-777,-8,-777,-777,27,-28,29,-777,-777,-32));
 
 __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
   // CHECK-LABEL: test_mm256_maskz_cvtepi8_epi16
@@ -2366,6 +2442,7 @@ __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_cvtepi8_epi16(__U, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepi8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,0,3,0,0,-6,0,-8,0,0,27,-28,29,0,0,-32));
 
 __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_cvtepu8_epi16
@@ -2373,6 +2450,7 @@ __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_cvtepu8_epi16(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepu8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,-777,27,-777,-777,30,-777,32));
 
 __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_cvtepu8_epi16
@@ -2380,6 +2458,7 @@ __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_cvtepu8_epi16(__U, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepu8_epi16((__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,0,27,0,0,30,0,32));
 
 __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
   // CHECK-LABEL: test_mm256_mask_cvtepu8_epi16
@@ -2387,6 +2466,7 @@ __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_cvtepu8_epi16(__W, __U, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepu8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,-777,3,-777,-777,6,-777,8,-777,-777,27,28,29,-777,-777,32));
 
 __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
   // CHECK-LABEL: test_mm256_maskz_cvtepu8_epi16
@@ -2394,6 +2474,7 @@ __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_cvtepu8_epi16(__U, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepu8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,0,3,0,0,6,0,8,0,0,27,28,29,0,0,32));
 
 __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_sllv_epi16
@@ -2468,6 +2549,7 @@ __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_sll_epi16(__U, __A, __B); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_slli_epi16((__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_slli_epi16
@@ -2475,6 +2557,7 @@ __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_slli_epi16(__W, __U, __A, 5); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_slli_epi16((__m128i)(__v8hi){100, 101, 102, 103, 104, 105, 106, 107}, (__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 100, 0, 102, 0, 104, 0, 106, 0));
 
 __m128i test_mm_mask_slli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) {
   // CHECK-LABEL: test_mm_mask_slli_epi16_2
@@ -3152,6 +3235,7 @@ __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_broadcastb_epi8(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_broadcastb_epi8((__m128i)(__v16qs){0,1,2,3,4,5,6,7,56,57,58,59,60,61,62,63},(__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,56,-120,58,-120,60,-120,62,-120));
 
 __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_broadcastb_epi8
@@ -3159,6 +3243,7 @@ __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_broadcastb_epi8(__M, __A);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_broadcastb_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
 
 __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) {
   // CHECK-LABEL: test_mm256_mask_broadcastb_epi8
@@ -3166,6 +3251,7 @@ __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A)
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_broadcastb_epi8(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_broadcastb_epi8((__m256i)(__v32qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120));
 
 __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
   // CHECK-LABEL: test_mm256_maskz_broadcastb_epi8
@@ -3173,6 +3259,7 @@ __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
   // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_broadcastb_epi8(__M, __A);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_maskz_broadcastb_epi8((__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
 
 __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_broadcastw_epi16
@@ -3180,6 +3267,7 @@ __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_broadcastw_epi16(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_broadcastw_epi16((__m128i)(__v8hi){0,1,2,3,4,5,6,7},(__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120));
 
 __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
   // CHECK-LABEL: test_mm_maskz_broadcastw_epi16
@@ -3187,6 +3275,7 @@ __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_broadcastw_epi16(__M, __A);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_broadcastw_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120));
 
 __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm256_mask_broadcastw_epi16
@@ -3194,6 +3283,7 @@ __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_broadcastw_epi16(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_broadcastw_epi16((__m256i)(__v16hi){0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31},(__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,24,-120,26,-120,28,-120,30,-120));
 
 __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
   // CHECK-LABEL: test_mm256_maskz_broadcastw_epi16
@@ -3201,6 +3291,8 @@ __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_broadcastw_epi16(__M, __A);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_broadcastw_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120));
+
 __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
   // CHECK-LABEL: test_mm_mask_set1_epi8
   // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
@@ -3222,6 +3314,8 @@ __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_set1_epi8(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_mask_set1_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42));
+
 __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
   // CHECK-LABEL: test_mm_maskz_set1_epi8
   // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
@@ -3243,6 +3337,7 @@ __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){
   // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_set1_epi8( __M, __A);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_maskz_set1_epi8((__mmask16)0xAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
 
 __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
   // CHECK-LABEL: test_mm256_mask_set1_epi8
@@ -3281,6 +3376,7 @@ __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) {
   // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_set1_epi8(__O, __M, __A);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_mask_set1_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42));
 
 __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
   // CHECK-LABEL: test_mm256_maskz_set1_epi8
@@ -3319,7 +3415,7 @@ __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) {
   // CHECK:  select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_set1_epi8( __M, __A);
 }
-
+TEST_CONSTEXPR( match_v32qi( _mm256_maskz_set1_epi8( (__mmask32)0xAAAAAAAA, (char)42 ), 0,   42,  0, 42, 0,  42,  0,  42, 0,   42,  0, 42, 0,  42,  0,  42, 0,   42,  0, 42, 0,  42,  0,  42, 0,   42,  0, 42, 0,  42,  0,  42 ) );
 
 __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
   // CHECK-LABEL: test_mm256_mask_set1_epi16
@@ -3342,6 +3438,7 @@ __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
   // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_set1_epi16(__O, __M, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_set1_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,42),1,42,3,42,5,42,7,42,25,42,27,42,29,42,31,42));
 
 __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
   // CHECK-LABEL: test_mm256_maskz_set1_epi16
@@ -3364,6 +3461,7 @@ __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
   // CHECK:  select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_set1_epi16(__M, __A); 
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_set1_epi16((__mmask16)0xAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42));
 
 __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
   // CHECK-LABEL: test_mm_mask_set1_epi16
@@ -3378,6 +3476,7 @@ __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_set1_epi16(__O, __M, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_mask_set1_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xAA,42),1,42,3,42,5,42,7,42));
 
 __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
   // CHECK-LABEL: test_mm_maskz_set1_epi16
@@ -3392,6 +3491,8 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_set1_epi16(__M, __A); 
 }
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_set1_epi16((__mmask8)0xAA,42),0,42,0,42,0,42,0,42));
+
 __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_permutexvar_epi16
   // CHECK: @llvm.x86.avx512.permvar.hi.128

From 94a70064453773a7890eb62f63d3635109df1754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= <tuomas.karna@intel.com>
Date: Mon, 10 Nov 2025 17:09:02 +0200
Subject: [PATCH 14/28] [MLIR][XeGPU][TransformOps] Add set_op_layout_attr op
 (#166854)

Adds `transform.xegpu.set_op_layout_attr` transform op that attaches
`xegpu.layout` attribute to the target op.
---
 .../XeGPU/TransformOps/XeGPUTransformOps.td   |  65 +++++++++
 .../XeGPU/TransformOps/XeGPUTransformOps.cpp  | 121 +++++++++++++---
 mlir/python/mlir/dialects/transform/xegpu.py  |  47 ++++++
 .../Dialect/XeGPU/transform-ops-invalid.mlir  |  58 ++++++++
 mlir/test/Dialect/XeGPU/transform-ops.mlir    | 134 ++++++++++++++++++
 .../python/dialects/transform_xegpu_ext.py    |  49 +++++++
 6 files changed, 455 insertions(+), 19 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
index ed277ef7bd554..34f333e556deb 100644
--- a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td
@@ -96,4 +96,69 @@ def SetDescLayoutOp : Op<Transform_Dialect, "xegpu.set_desc_layout", [
   }];
 }
 
+def SetOpLayoutAttrOp : Op<Transform_Dialect, "xegpu.set_op_layout_attr", [
+  AttrSizedOperandSegments,
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+  TransformOpInterface
+]> {
+
+  let summary = "Set xegpu.layout attribute of an op.";
+  let description = [{
+    Sets the `xegpu.layout` attribute of an op. If `result=true`, sets the
+    `layout_result_{index}`, otherwise `layout_operand_{index}` attribute. The
+    target operand/result value is defined by the `index` argument. The layout
+    is defined by the `sg_layout`, `sg_data` and optional `inst_data` attributes.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                   DefaultValuedOptionalAttr<I64Attr, "0">:$index,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$sg_layout,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$sg_data,
+                   Variadic<TransformAnyParamTypeOrAnyHandle>:$inst_data,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_layout,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_sg_data,
+                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_inst_data,
+                   DefaultValuedAttr<UnitAttr, "false">:$result
+                   );
+
+  let results = (outs);
+  let builders = [
+    OpBuilder<(ins "Value":$target,
+                   "int64_t":$index,
+                   "ArrayRef<OpFoldResult>":$mixedSgLayout,
+                   "ArrayRef<OpFoldResult>":$mixedSgData,
+                   "ArrayRef<OpFoldResult>":$mixedInstData,
+                   CArg<"bool", "false">:$result
+                   )>,
+  ];
+
+  let assemblyFormat = [{
+    $target (`result` $result^)? (`index` `=` $index^)?
+    `sg_layout` `=` custom<DynamicIndexList>($sg_layout, $static_sg_layout)
+    `sg_data` `=` custom<DynamicIndexList>($sg_data, $static_sg_data)
+    (`inst_data` `=` custom<DynamicIndexList>($inst_data, $static_inst_data)^)?
+    attr-dict `:` qualified(type(operands))
+  }];
+
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure apply(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::transform::TransformResults &transformResults,
+        ::mlir::transform::TransformState &state);
+
+    ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgLayout() {
+      Builder b(getContext());
+      return getMixedValues(getStaticSgLayout(), getSgLayout(), b);
+    }
+    ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgData() {
+      Builder b(getContext());
+      return getMixedValues(getStaticSgData(), getSgData(), b);
+    }
+    ::llvm::SmallVector<::mlir::OpFoldResult> getMixedInstData() {
+      Builder b(getContext());
+      return getMixedValues(getStaticInstData(), getInstData(), b);
+    }
+  }];
+}
+
 #endif // XEGPU_TRANSFORM_OPS
diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
index 0683699f467e9..5fdd8534e4e51 100644
--- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp
@@ -132,6 +132,36 @@ createLayoutAttr(MLIRContext *ctx, ArrayRef<int32_t> sgLayout,
       /*order=*/nullptr);
 }
 
+/// Generate `xegpu::LayoutAttr` from op mixed layout values.
+DiagnosedSilenceableFailure
+getLayoutAttrFromOperands(MLIRContext *ctx, transform::TransformState &state,
+                          TransformOpInterface transformOp,
+                          ArrayRef<::mlir::OpFoldResult> mixedSgLayout,
+                          ArrayRef<::mlir::OpFoldResult> mixedSgData,
+                          ArrayRef<::mlir::OpFoldResult> mixedInstData,
+                          xegpu::LayoutAttr &layoutAttr) {
+  SmallVector<int32_t> sgLayout, sgData, instData;
+  auto status =
+      convertMixedValuesToInt(state, transformOp, sgLayout, mixedSgLayout);
+  if (!status.succeeded())
+    return status;
+
+  status = convertMixedValuesToInt(state, transformOp, sgData, mixedSgData);
+  if (!status.succeeded())
+    return status;
+
+  status = convertMixedValuesToInt(state, transformOp, instData, mixedInstData);
+  if (!status.succeeded())
+    return status;
+  auto maybeInstData = instData.empty()
+                           ? std::nullopt
+                           : std::optional<ArrayRef<int32_t>>(instData);
+
+  layoutAttr = createLayoutAttr(ctx, sgLayout, sgData, maybeInstData);
+
+  return DiagnosedSilenceableFailure::success();
+}
+
 /// Replace xegpu.create_nd_desc op with a new one with the given layout.
 static xegpu::CreateNdDescOp
 setDescLayout(transform::TransformRewriter &rewriter,
@@ -207,26 +237,13 @@ transform::SetDescLayoutOp::apply(transform::TransformRewriter &rewriter,
   }
   Operation *target = *targetOps.begin();
 
-  SmallVector<int32_t> sgLayout;
-  DiagnosedSilenceableFailure status =
-      convertMixedValuesToInt(state, (*this), sgLayout, getMixedSgLayout());
+  xegpu::LayoutAttr layoutAttr = nullptr;
+  auto status = getLayoutAttrFromOperands(getContext(), state, (*this),
+                                          getMixedSgLayout(), getMixedSgData(),
+                                          getMixedInstData(), layoutAttr);
   if (!status.succeeded())
     return status;
 
-  SmallVector<int32_t> sgData;
-  status = convertMixedValuesToInt(state, (*this), sgData, getMixedSgData());
-  if (!status.succeeded())
-    return status;
-
-  SmallVector<int32_t> instData;
-  status =
-      convertMixedValuesToInt(state, (*this), instData, getMixedInstData());
-  if (!status.succeeded())
-    return status;
-  auto maybeInstData = instData.empty()
-                           ? std::nullopt
-                           : std::optional<ArrayRef<int32_t>>(instData);
-
   // For now only create_nd_desc op is supported.
   auto descOp = dyn_cast<xegpu::CreateNdDescOp>(target);
   if (!descOp) {
@@ -238,8 +255,6 @@ transform::SetDescLayoutOp::apply(transform::TransformRewriter &rewriter,
   }
 
   // Set layout attr in desc op's return type. Replaces old desc op.
-  auto layoutAttr =
-      createLayoutAttr(rewriter.getContext(), sgLayout, sgData, maybeInstData);
   auto newdescOp = setDescLayout(rewriter, descOp, layoutAttr);
 
   // Map result handles.
@@ -258,6 +273,74 @@ void transform::SetDescLayoutOp::getEffects(
   modifiesPayload(effects);
 }
 
+void transform::SetOpLayoutAttrOp::build(
+    OpBuilder &builder, OperationState &ostate, Value target, int64_t index,
+    ArrayRef<OpFoldResult> mixedSgLayout, ArrayRef<OpFoldResult> mixedSgData,
+    ArrayRef<OpFoldResult> mixedInstData, bool result) {
+  SmallVector<int64_t> staticSgLayout, staticSgData, staticInstData;
+  SmallVector<Value> dynamicSgLayout, dynamicSgData, dynamicInstData;
+  dispatchIndexOpFoldResults(mixedSgLayout, dynamicSgLayout, staticSgLayout);
+  dispatchIndexOpFoldResults(mixedSgData, dynamicSgData, staticSgData);
+  dispatchIndexOpFoldResults(mixedInstData, dynamicInstData, staticInstData);
+  build(builder, ostate, target.getType(),
+        /*target=*/target,
+        /*index=*/index,
+        /*sg_layout=*/dynamicSgLayout,
+        /*sg_data=*/dynamicSgData,
+        /*inst_data=*/dynamicInstData,
+        /*static_sg_layout=*/staticSgLayout,
+        /*static_sg_data=*/staticSgData,
+        /*static_inst_data=*/staticInstData,
+        /*result=*/result);
+}
+
+DiagnosedSilenceableFailure
+transform::SetOpLayoutAttrOp::apply(transform::TransformRewriter &rewriter,
+                                    transform::TransformResults &results,
+                                    transform::TransformState &state) {
+  auto targetOps = state.getPayloadOps(getTarget());
+  if (!llvm::hasSingleElement(targetOps)) {
+    return emitDefiniteFailure() << "Requires exactly one targetOp handle (got "
+                                 << llvm::range_size(targetOps) << ")";
+  }
+  Operation *target = *targetOps.begin();
+
+  bool resultTarget = getResult();
+
+  int64_t index = getIndex();
+  if (resultTarget && index >= target->getNumResults()) {
+    return emitSilenceableFailure(getLoc())
+           << "Index exceeds the number of op results";
+  }
+  if (!resultTarget && index >= target->getNumOperands()) {
+    return emitSilenceableFailure(getLoc())
+           << "Index exceeds the number of op operands";
+  }
+
+  xegpu::LayoutAttr layoutAttr = nullptr;
+  auto status = getLayoutAttrFromOperands(getContext(), state, (*this),
+                                          getMixedSgLayout(), getMixedSgData(),
+                                          getMixedInstData(), layoutAttr);
+  if (!status.succeeded())
+    return status;
+
+  // Set layout attribute for the op result or operand
+  if (resultTarget)
+    xegpu::setDistributeLayoutAttr(target->getResult(index), layoutAttr);
+  else
+    xegpu::setDistributeLayoutAttr(target->getOpOperand(index), layoutAttr);
+  return DiagnosedSilenceableFailure::success();
+}
+
+void transform::SetOpLayoutAttrOp::getEffects(
+    ::llvm::SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  onlyReadsHandle(getTargetMutable(), effects);
+  onlyReadsHandle(getSgLayoutMutable(), effects);
+  onlyReadsHandle(getSgDataMutable(), effects);
+  onlyReadsHandle(getInstDataMutable(), effects);
+  modifiesPayload(effects);
+}
+
 namespace {
 class XeGPUTransformDialectExtension
     : public transform::TransformDialectExtension<
diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py
index d23f2ac16429f..ce8015d8f557b 100644
--- a/mlir/python/mlir/dialects/transform/xegpu.py
+++ b/mlir/python/mlir/dialects/transform/xegpu.py
@@ -85,3 +85,50 @@ def __init__(
             loc=loc,
             ip=ip,
         )
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class SetOpLayoutAttrOp(SetOpLayoutAttrOp):
+    """Specialization for SetOpLayoutAttrOp class."""
+
+    def __init__(
+        self,
+        target: Union[Operation, Value],
+        sg_layout: MixedValues,
+        sg_data: MixedValues,
+        *,
+        inst_data: Optional[MixedValues] = None,
+        index: Optional[Union[int, Attribute]] = None,
+        result: Optional[Union[bool, Attribute]] = None,
+        loc=None,
+        ip=None,
+    ):
+        inst_data = [] if inst_data is None else inst_data
+        (
+            dynamic_sg_layout,
+            static_sg_layout,
+            _,
+        ) = _dispatch_dynamic_index_list(sg_layout)
+        (
+            dynamic_sg_data,
+            static_sg_data,
+            _,
+        ) = _dispatch_dynamic_index_list(sg_data)
+        (
+            dynamic_inst_data,
+            static_inst_data,
+            _,
+        ) = _dispatch_dynamic_index_list(inst_data)
+        super().__init__(
+            _get_op_result_or_value(target),
+            dynamic_sg_layout,
+            dynamic_sg_data,
+            dynamic_inst_data,
+            static_sg_layout=static_sg_layout,
+            static_sg_data=static_sg_data,
+            static_inst_data=static_inst_data,
+            index=index,
+            result=result,
+            loc=loc,
+            ip=ip,
+        )
diff --git a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
index 303584518f9f4..726b6748452ae 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir
@@ -13,3 +13,61 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_bad_result_index
+func.func @set_op_layout_attr_bad_result_index(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Index exceeds the number of op results}}
+    transform.xegpu.set_op_layout_attr %0 result index = 1 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_bad_operand_index
+func.func @set_op_layout_attr_bad_operand_index(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Index exceeds the number of op operands}}
+    transform.xegpu.set_op_layout_attr %0 index = 1 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_multiple
+func.func @set_op_layout_attr_multiple(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  %3 = arith.extf %2 : vector<256x32xf32> to vector<256x32xf64>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error@below {{Requires exactly one targetOp handle (got 2)}}
+    transform.xegpu.set_op_layout_attr %0 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir
index 342de429d2e90..bd6a79244ed30 100644
--- a/mlir/test/Dialect/XeGPU/transform-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir
@@ -118,3 +118,137 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_result_default_index
+func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  %2 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16>
+  %3 = xegpu.load_nd %2[0, 0]  : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16>
+  %4 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16>
+  %5 = xegpu.load_nd %4[0, 0]  : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16>
+  // CHECK: = xegpu.dpas
+  // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+  %6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    transform.xegpu.set_op_layout_attr %0 result sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_result_sg_param
+func.func @set_op_layout_attr_result_sg_param(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  // CHECK: = arith.extf %1
+  // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    %layout0 = transform.param.constant 8 : i64 -> !transform.param<i64>
+    transform.xegpu.set_op_layout_attr %0 result sg_layout = [%layout0, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op, !transform.param<i64>
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_result_sg_param2
+func.func @set_op_layout_attr_result_sg_param2(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  // CHECK: = arith.extf %1
+  // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    %layout0 = transform.param.constant 8 : i64 -> !transform.param<i64>
+    %layout1 = transform.param.constant 4 : i64 -> !transform.param<i64>
+    transform.xegpu.set_op_layout_attr %0 result sg_layout = [%layout0, %layout1] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op, !transform.param<i64>, !transform.param<i64>
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_result0
+func.func @set_op_layout_attr_result0(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  // CHECK: = arith.extf %1
+  // CHECK-SAME: {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    transform.xegpu.set_op_layout_attr %0 result index = 0 sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_operand_minimal
+func.func @set_op_layout_attr_operand_minimal(%arg0: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  // CHECK: = arith.extf %1
+  // CHECK-SAME: {layout_operand_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64]>}
+  %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    transform.xegpu.set_op_layout_attr %0 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op
+    transform.yield
+  }
+}
+// -----
+
+// CHECK-LABEL: @set_op_layout_attr_operand1
+func.func @set_op_layout_attr_operand1(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>) {
+  %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %1 = xegpu.load_nd %0[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  %2 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16>
+  %3 = xegpu.load_nd %2[0, 0]  : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16>
+  // CHECK: = arith.addf %1, %3
+  // CHECK-SAME: {layout_operand_1 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], inst_data = [8, 16]>}
+  %6 = arith.addf %1, %3 : vector<256x32xf16>
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["arith.addf"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // CHECK: transform.xegpu.set_op_layout_attr %{{.*}}
+    transform.xegpu.set_op_layout_attr %0 index = 1 sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py
index f83c807f571e1..0b587d2020aa6 100644
--- a/mlir/test/python/dialects/transform_xegpu_ext.py
+++ b/mlir/test/python/dialects/transform_xegpu_ext.py
@@ -64,3 +64,52 @@ def setDescLayoutInstData():
     # CHECK: sg_layout = [6, 4]
     # CHECK: sg_data = [32, 16]
     # CHECK: inst_data = [8, 16]
+
+
+@run
+def setOpLayoutAttrOperandMinimal():
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate,
+        [],
+        transform.OperationType.get("xegpu.dpas"),
+    )
+    with InsertionPoint(sequence.body):
+        xegpu.SetOpLayoutAttrOp(
+            sequence.bodyTarget,
+            sg_layout=[6, 4],
+            sg_data=[32, 16],
+        )
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: setOpLayoutAttr
+    # CHECK: transform.xegpu.set_op_layout_attr %
+    # NO-CHECK: index = 0
+    # NO-CHECK: result
+    # CHECK: sg_layout = [6, 4]
+    # CHECK: sg_data = [32, 16]
+    # NO-CHECK: inst_data
+
+
+@run
+def setOpLayoutAttrResult():
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate,
+        [],
+        transform.OperationType.get("xegpu.dpas"),
+    )
+    with InsertionPoint(sequence.body):
+        xegpu.SetOpLayoutAttrOp(
+            sequence.bodyTarget,
+            index=0,
+            sg_layout=[6, 4],
+            sg_data=[32, 16],
+            inst_data=[8, 16],
+            result=True,
+        )
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: setOpLayoutAttr
+    # CHECK: transform.xegpu.set_op_layout_attr %
+    # NO-CHECK: index = 0
+    # CHECK: result
+    # CHECK: sg_layout = [6, 4]
+    # CHECK: sg_data = [32, 16]
+    # CHECK: inst_data = [8, 16]

From 69c8756af0734c78542f09996dc6101792474324 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Mon, 10 Nov 2025 10:12:00 -0500
Subject: [PATCH 15/28] [NFC][PowerPC] Pre-commit adding test case: use
 millicode for memmove (#166961)

add test case to test lib call are used for the memmove milicode
---
 llvm/test/CodeGen/PowerPC/milicode32.ll | 56 ++++++++++++++++++
 llvm/test/CodeGen/PowerPC/milicode64.ll | 79 +++++++++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll
index 78d036202fe4e..ddadd01a748f1 100644
--- a/llvm/test/CodeGen/PowerPC/milicode32.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode32.ll
@@ -69,3 +69,59 @@ entry:
 }
 
 declare i32 @strlen(ptr noundef) nounwind
+
+define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 {
+; CHECK-AIX-32-P9-LABEL: test_memmove:
+; CHECK-AIX-32-P9:       # %bb.0: # %entry
+; CHECK-AIX-32-P9-NEXT:    mflr r0
+; CHECK-AIX-32-P9-NEXT:    stwu r1, -80(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r0, 88(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r31, 76(r1) # 4-byte Folded Spill
+; CHECK-AIX-32-P9-NEXT:    mr r31, r3
+; CHECK-AIX-32-P9-NEXT:    stw r3, 72(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r4, 68(r1)
+; CHECK-AIX-32-P9-NEXT:    stw r5, 64(r1)
+; CHECK-AIX-32-P9-NEXT:    bl .___memmove[PR]
+; CHECK-AIX-32-P9-NEXT:    nop
+; CHECK-AIX-32-P9-NEXT:    mr r3, r31
+; CHECK-AIX-32-P9-NEXT:    lwz r31, 76(r1) # 4-byte Folded Reload
+; CHECK-AIX-32-P9-NEXT:    addi r1, r1, 80
+; CHECK-AIX-32-P9-NEXT:    lwz r0, 8(r1)
+; CHECK-AIX-32-P9-NEXT:    mtlr r0
+; CHECK-AIX-32-P9-NEXT:    blr
+;
+; CHECK-LINUX32-P9-LABEL: test_memmove:
+; CHECK-LINUX32-P9:       # %bb.0: # %entry
+; CHECK-LINUX32-P9-NEXT:    mflr r0
+; CHECK-LINUX32-P9-NEXT:    stwu r1, -32(r1)
+; CHECK-LINUX32-P9-NEXT:    stw r0, 36(r1)
+; CHECK-LINUX32-P9-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-LINUX32-P9-NEXT:    .cfi_offset lr, 4
+; CHECK-LINUX32-P9-NEXT:    .cfi_offset r30, -8
+; CHECK-LINUX32-P9-NEXT:    stw r30, 24(r1) # 4-byte Folded Spill
+; CHECK-LINUX32-P9-NEXT:    mr r30, r3
+; CHECK-LINUX32-P9-NEXT:    stw r3, 20(r1)
+; CHECK-LINUX32-P9-NEXT:    stw r4, 16(r1)
+; CHECK-LINUX32-P9-NEXT:    stw r5, 12(r1)
+; CHECK-LINUX32-P9-NEXT:    bl memmove
+; CHECK-LINUX32-P9-NEXT:    mr r3, r30
+; CHECK-LINUX32-P9-NEXT:    lwz r30, 24(r1) # 4-byte Folded Reload
+; CHECK-LINUX32-P9-NEXT:    lwz r0, 36(r1)
+; CHECK-LINUX32-P9-NEXT:    addi r1, r1, 32
+; CHECK-LINUX32-P9-NEXT:    mtlr r0
+; CHECK-LINUX32-P9-NEXT:    blr
+entry:
+  %destination.addr = alloca ptr, align 4
+  %source.addr = alloca ptr, align 4
+  %num.addr = alloca i32, align 4
+  store ptr %destination, ptr %destination.addr, align 4
+  store ptr %source, ptr %source.addr, align 4
+  store i32 %num, ptr %num.addr, align 4
+  %0 = load ptr, ptr %destination.addr, align 4
+  %1 = load ptr, ptr %source.addr, align 4
+  %2 = load i32, ptr %num.addr, align 4
+  call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false)
+  ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll
index 8b87529d9a6d8..f7814a424e0b9 100644
--- a/llvm/test/CodeGen/PowerPC/milicode64.ll
+++ b/llvm/test/CodeGen/PowerPC/milicode64.ll
@@ -100,3 +100,82 @@ entry:
 }
 
 declare i64 @strlen(ptr noundef) nounwind
+
+define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 {
+; CHECK-LE-P9-LABEL: test_memmove:
+; CHECK-LE-P9:       # %bb.0: # %entry
+; CHECK-LE-P9-NEXT:    mflr r0
+; CHECK-LE-P9-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-LE-P9-NEXT:    .cfi_offset lr, 16
+; CHECK-LE-P9-NEXT:    .cfi_offset r30, -16
+; CHECK-LE-P9-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-LE-P9-NEXT:    stdu r1, -80(r1)
+; CHECK-LE-P9-NEXT:    std r0, 96(r1)
+; CHECK-LE-P9-NEXT:    mr r30, r3
+; CHECK-LE-P9-NEXT:    std r3, 56(r1)
+; CHECK-LE-P9-NEXT:    std r4, 48(r1)
+; CHECK-LE-P9-NEXT:    std r5, 40(r1)
+; CHECK-LE-P9-NEXT:    bl memmove
+; CHECK-LE-P9-NEXT:    nop
+; CHECK-LE-P9-NEXT:    mr r3, r30
+; CHECK-LE-P9-NEXT:    addi r1, r1, 80
+; CHECK-LE-P9-NEXT:    ld r0, 16(r1)
+; CHECK-LE-P9-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-LE-P9-NEXT:    mtlr r0
+; CHECK-LE-P9-NEXT:    blr
+;
+; CHECK-BE-P9-LABEL: test_memmove:
+; CHECK-BE-P9:       # %bb.0: # %entry
+; CHECK-BE-P9-NEXT:    mflr r0
+; CHECK-BE-P9-NEXT:    stdu r1, -160(r1)
+; CHECK-BE-P9-NEXT:    std r0, 176(r1)
+; CHECK-BE-P9-NEXT:    .cfi_def_cfa_offset 160
+; CHECK-BE-P9-NEXT:    .cfi_offset lr, 16
+; CHECK-BE-P9-NEXT:    .cfi_offset r30, -16
+; CHECK-BE-P9-NEXT:    std r30, 144(r1) # 8-byte Folded Spill
+; CHECK-BE-P9-NEXT:    mr r30, r3
+; CHECK-BE-P9-NEXT:    std r3, 136(r1)
+; CHECK-BE-P9-NEXT:    std r4, 128(r1)
+; CHECK-BE-P9-NEXT:    std r5, 120(r1)
+; CHECK-BE-P9-NEXT:    bl memmove
+; CHECK-BE-P9-NEXT:    nop
+; CHECK-BE-P9-NEXT:    mr r3, r30
+; CHECK-BE-P9-NEXT:    ld r30, 144(r1) # 8-byte Folded Reload
+; CHECK-BE-P9-NEXT:    addi r1, r1, 160
+; CHECK-BE-P9-NEXT:    ld r0, 16(r1)
+; CHECK-BE-P9-NEXT:    mtlr r0
+; CHECK-BE-P9-NEXT:    blr
+;
+; CHECK-AIX-64-P9-LABEL: test_memmove:
+; CHECK-AIX-64-P9:       # %bb.0: # %entry
+; CHECK-AIX-64-P9-NEXT:    mflr r0
+; CHECK-AIX-64-P9-NEXT:    stdu r1, -144(r1)
+; CHECK-AIX-64-P9-NEXT:    std r0, 160(r1)
+; CHECK-AIX-64-P9-NEXT:    std r31, 136(r1) # 8-byte Folded Spill
+; CHECK-AIX-64-P9-NEXT:    mr r31, r3
+; CHECK-AIX-64-P9-NEXT:    std r3, 128(r1)
+; CHECK-AIX-64-P9-NEXT:    std r4, 120(r1)
+; CHECK-AIX-64-P9-NEXT:    std r5, 112(r1)
+; CHECK-AIX-64-P9-NEXT:    bl .memmove[PR]
+; CHECK-AIX-64-P9-NEXT:    nop
+; CHECK-AIX-64-P9-NEXT:    mr r3, r31
+; CHECK-AIX-64-P9-NEXT:    ld r31, 136(r1) # 8-byte Folded Reload
+; CHECK-AIX-64-P9-NEXT:    addi r1, r1, 144
+; CHECK-AIX-64-P9-NEXT:    ld r0, 16(r1)
+; CHECK-AIX-64-P9-NEXT:    mtlr r0
+; CHECK-AIX-64-P9-NEXT:    blr
+entry:
+  %destination.addr = alloca ptr, align 8
+  %source.addr = alloca ptr, align 8
+  %num.addr = alloca i64, align 8
+  store ptr %destination, ptr %destination.addr, align 8
+  store ptr %source, ptr %source.addr, align 8
+  store i64 %num, ptr %num.addr, align 8
+  %0 = load ptr, ptr %destination.addr, align 8
+  %1 = load ptr, ptr %source.addr, align 8
+  %2 = load i64, ptr %num.addr, align 8
+  call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false)
+  ret ptr %0
+}
+
+declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg)

From c17a839d8335ae75447221adf62f7993e575a913 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <nick.sarnie@intel.com>
Date: Tue, 11 Nov 2025 00:12:13 +0900
Subject: [PATCH 16/28] [OMPIRBuilder] Fix addrspace of internal critical
 section lock (#166459)

First, for internal variables, they are always global, so use the global
AS by default unless specified otherwise. We can't really use `0` as a
default like we do now because that has an actual meaning on some
targets, so we really need specified vs unspecified, so I used
`std::optional` which is already used in many places in OMPIRBuilder.

Second, for the critical lock variable, add an addrspace cast if needed.

Signed-off-by: Nick Sarnie <nick.sarnie@intel.com>
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 25 ++++++++++++-------
 clang/test/OpenMP/force-usm.c                 |  2 +-
 .../OpenMP/spirv_target_codegen_basic.cpp     |  6 +++++
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 13 +++++-----
 5 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 121de42248e3b..44ba72c5c76c7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2000,22 +2000,29 @@ void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
   // Prepare arguments and build a call to __kmpc_critical
   if (!CGF.HaveInsertPoint())
     return;
+  llvm::FunctionCallee RuntimeFcn = OMPBuilder.getOrCreateRuntimeFunction(
+      CGM.getModule(),
+      Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical);
+  llvm::Value *LockVar = getCriticalRegionLock(CriticalName);
+  unsigned LockVarArgIdx = 2;
+  if (cast<llvm::GlobalVariable>(LockVar)->getAddressSpace() !=
+      RuntimeFcn.getFunctionType()
+          ->getParamType(LockVarArgIdx)
+          ->getPointerAddressSpace())
+    LockVar = CGF.Builder.CreateAddrSpaceCast(
+        LockVar, RuntimeFcn.getFunctionType()->getParamType(LockVarArgIdx));
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
-                         getCriticalRegionLock(CriticalName)};
+                         LockVar};
   llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
                                                 std::end(Args));
   if (Hint) {
     EnterArgs.push_back(CGF.Builder.CreateIntCast(
         CGF.EmitScalarExpr(Hint), CGM.Int32Ty, /*isSigned=*/false));
   }
-  CommonActionTy Action(
-      OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(),
-          Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical),
-      EnterArgs,
-      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
-                                            OMPRTL___kmpc_end_critical),
-      Args);
+  CommonActionTy Action(RuntimeFcn, EnterArgs,
+                        OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_end_critical),
+                        Args);
   CriticalOpGen.setAction(Action);
   emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
 }
diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c
index 5c63a9a5e7004..45c0e28b525da 100644
--- a/clang/test/OpenMP/force-usm.c
+++ b/clang/test/OpenMP/force-usm.c
@@ -46,7 +46,7 @@ int main(void) {
 // CHECK-USM-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK-USM:       user_code.entry:
 // CHECK-USM-NEXT:    store i32 1, ptr [[TMP0]], align 4
-// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
+// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(1) @pGI_decl_tgt_ref_ptr, align 8
 // CHECK-USM-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
 // CHECK-USM-NEXT:    store i32 2, ptr [[TMP3]], align 4
 // CHECK-USM-NEXT:    call void @__kmpc_target_deinit()
diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
index fb2810e88c063..6e029fb93644d 100644
--- a/clang/test/OpenMP/spirv_target_codegen_basic.cpp
+++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
@@ -6,12 +6,18 @@
 // CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
 // CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy
 
+// CHECK: @"_gomp_critical_user_$var" = common addrspace(1) global [8 x i32] zeroinitializer, align 8
+
 // CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}}
 
+// CHECK: call spir_func addrspace(9) void @__kmpc_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
+// CHECK: call spir_func addrspace(9) void @__kmpc_end_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
+
 int main() {
   int ret = 0;
   #pragma omp target
   for(int i = 0; i < 5; i++)
+    #pragma omp critical
     ret++;
   return ret;
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b3d7ab4acf303..fd6b9729658c1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3654,7 +3654,7 @@ class OpenMPIRBuilder {
   /// \param Name Name of the variable.
   LLVM_ABI GlobalVariable *
   getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
-                              unsigned AddressSpace = 0);
+                              std::optional<unsigned> AddressSpace = {});
 };
 
 /// Class to represented the control flow structure of an OpenMP canonical loop.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index fff9a815e5368..7dc32fda0eed6 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -8460,9 +8460,8 @@ OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
                                                 Config.separator());
 }
 
-GlobalVariable *
-OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
-                                             unsigned AddressSpace) {
+GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
+    Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
   auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
   if (Elem.second) {
     assert(Elem.second->getValueType() == Ty &&
@@ -8472,16 +8471,18 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
     // variable for possibly changing that to internal or private, or maybe
     // create different versions of the function for different OMP internal
     // variables.
+    const DataLayout &DL = M.getDataLayout();
+    unsigned AddressSpaceVal =
+        AddressSpace ? *AddressSpace : DL.getDefaultGlobalsAddressSpace();
     auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
                        ? GlobalValue::InternalLinkage
                        : GlobalValue::CommonLinkage;
     auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
                                   Constant::getNullValue(Ty), Elem.first(),
                                   /*InsertBefore=*/nullptr,
-                                  GlobalValue::NotThreadLocal, AddressSpace);
-    const DataLayout &DL = M.getDataLayout();
+                                  GlobalValue::NotThreadLocal, AddressSpaceVal);
     const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
-    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
+    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
     GV->setAlignment(std::max(TypeAlign, PtrAlign));
     Elem.second = GV;
   }

From 99a5e7b48d26077e9aa8c88d6c6f2a755f1225bc Mon Sep 17 00:00:00 2001
From: Andrei Golubev <andrey.golubev@intel.com>
Date: Mon, 10 Nov 2025 15:22:49 +0000
Subject: [PATCH 17/28] [mlir][NFC] Split registerAndParseCLIOptions() in
 mlir-opt (#166538)

mlir-opt's registerAndParseCLIOptions() forces users to both register
default MLIR options and parse the command line string. Custom mlir-opt
implementations, however, may need to provide own options or own
parsing. It seems that separating the two functions makes it easier to
achieve necessary customizations.

For example, one can register "default" options, then register custom
options (not available in standard mlir-opt), then parse all of them.
Other cases include two-stage parsing where some additional options
become available based on parsed information (e.g. compilation target
can allow additional options to be present).
---
 .../include/mlir/Tools/mlir-opt/MlirOptMain.h | 14 ++++++++
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp       | 33 ++++++++++++-------
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
index b7394387b0f9a..79dfd7a2795f0 100644
--- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
+++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h
@@ -359,6 +359,20 @@ class MlirOptMainConfig {
 /// the loaded IR.
 using PassPipelineFn = llvm::function_ref<LogicalResult(PassManager &pm)>;
 
+/// Register basic command line options.
+/// - toolName is used for the header displayed by `--help`.
+/// - registry should contain all the dialects that can be parsed in the source.
+/// - return std::string for help header.
+std::string registerCLIOptions(llvm::StringRef toolName,
+                               DialectRegistry &registry);
+
+/// Parse command line options.
+/// - helpHeader is used for the header displayed by `--help`.
+/// - return std::pair<std::string, std::string> for
+///   inputFilename and outputFilename command line option values.
+std::pair<std::string, std::string> parseCLIOptions(int argc, char **argv,
+                                                    llvm::StringRef helpHeader);
+
 /// Register and parse command line options.
 /// - toolName is used for the header displayed by `--help`.
 /// - registry should contain all the dialects that can be parsed in the source.
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 9ef405dad5a70..018a188d09109 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -681,17 +681,8 @@ processBuffer(raw_ostream &os, std::unique_ptr<MemoryBuffer> ownedBuffer,
   return success();
 }
 
-std::pair<std::string, std::string>
-mlir::registerAndParseCLIOptions(int argc, char **argv,
-                                 llvm::StringRef toolName,
-                                 DialectRegistry &registry) {
-  static cl::opt<std::string> inputFilename(
-      cl::Positional, cl::desc("<input file>"), cl::init("-"));
-
-  static cl::opt<std::string> outputFilename("o", cl::desc("Output filename"),
-                                             cl::value_desc("filename"),
-                                             cl::init("-"));
-  // Register any command line options.
+std::string mlir::registerCLIOptions(llvm::StringRef toolName,
+                                     DialectRegistry &registry) {
   MlirOptMainConfig::registerCLOptions(registry);
   registerAsmPrinterCLOptions();
   registerMLIRContextCLOptions();
@@ -706,11 +697,29 @@ mlir::registerAndParseCLIOptions(int argc, char **argv,
     interleaveComma(registry.getDialectNames(), os,
                     [&](auto name) { os << name; });
   }
-  // Parse pass names in main to ensure static initialization completed.
+  return helpHeader;
+}
+
+std::pair<std::string, std::string>
+mlir::parseCLIOptions(int argc, char **argv, llvm::StringRef helpHeader) {
+  static cl::opt<std::string> inputFilename(
+      cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+  static cl::opt<std::string> outputFilename("o", cl::desc("Output filename"),
+                                             cl::value_desc("filename"),
+                                             cl::init("-"));
   cl::ParseCommandLineOptions(argc, argv, helpHeader);
   return std::make_pair(inputFilename.getValue(), outputFilename.getValue());
 }
 
+std::pair<std::string, std::string>
+mlir::registerAndParseCLIOptions(int argc, char **argv,
+                                 llvm::StringRef toolName,
+                                 DialectRegistry &registry) {
+  auto helpHeader = registerCLIOptions(toolName, registry);
+  return parseCLIOptions(argc, argv, helpHeader);
+}
+
 static LogicalResult printRegisteredDialects(DialectRegistry &registry) {
   llvm::outs() << "Available Dialects: ";
   interleave(registry.getDialectNames(), llvm::outs(), ",");

From 2681497d1b3ddceabdb58edff932053270705f10 Mon Sep 17 00:00:00 2001
From: Marcos Maronas <marcos.maronas@intel.com>
Date: Mon, 10 Nov 2025 15:24:50 +0000
Subject: [PATCH 18/28] [SPIRV] Allow multiple FuncParamAttr decoration on the
 same id. (#166782)

According to SPIR-V spec:

> It is invalid to decorate any given id or structure member more than
one time with the same
[decoration](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#Decoration),
unless explicitly allowed below for a specific decoration.

`FuncParamAttr` explicitly allows multiple uses of the decoration on the
same id, so this patch honors it.
---
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp   | 17 +++++++++--------
 .../SPIRV/spirv_param_decorations_quals.ll      |  4 +++-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index fbb127df16dd9..b8cd9c1358f00 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -249,17 +249,18 @@ static InstrSignature instrToSignature(const MachineInstr &MI,
   InstrSignature Signature{MI.getOpcode()};
   for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
     // The only decorations that can be applied more than once to a given <id>
-    // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442),
-    // and CacheControlStoreINTEL (6443). For all the rest of decorations, we
-    // will only add to the signature the Opcode, the id to which it applies,
-    // and the decoration id, disregarding any decoration flags. This will
-    // ensure that any subsequent decoration with the same id will be deemed as
-    // a duplicate. Then, at the call site, we will be able to handle duplicates
-    // in the best way.
+    // or structure member are FuncParamAttr (38), UserSemantic (5635),
+    // CacheControlLoadINTEL (6442), and CacheControlStoreINTEL (6443). For all
+    // the rest of decorations, we will only add to the signature the Opcode,
+    // the id to which it applies, and the decoration id, disregarding any
+    // decoration flags. This will ensure that any subsequent decoration with
+    // the same id will be deemed as a duplicate. Then, at the call site, we
+    // will be able to handle duplicates in the best way.
     unsigned Opcode = MI.getOpcode();
     if ((Opcode == SPIRV::OpDecorate) && i >= 2) {
       unsigned DecorationID = MI.getOperand(1).getImm();
-      if (DecorationID != SPIRV::Decoration::UserSemantic &&
+      if (DecorationID != SPIRV::Decoration::FuncParamAttr &&
+          DecorationID != SPIRV::Decoration::UserSemantic &&
           DecorationID != SPIRV::Decoration::CacheControlLoadINTEL &&
           DecorationID != SPIRV::Decoration::CacheControlStoreINTEL)
         continue;
diff --git a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll
index 260394b658348..fb550bb01a3a2 100644
--- a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll
+++ b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll
@@ -7,9 +7,11 @@ entry:
 
 ; CHECK-SPIRV: OpDecorate %[[#PId:]] Volatile
 ; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoAlias
+; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoWrite
 ; CHECK-SPIRV: %[[#PId]] = OpFunctionParameter %[[#]]
 
 !7 = !{!"volatile"}
 !8 = !{i32 38, i32 4} ; FuncParamAttr NoAlias
-!9 = !{!8}
+!11 = !{i32 38, i32 6} ; FuncParamAttr NoWrite
+!9 = !{!8, !11}
 !10 = !{!9}

From 2d381bf65dd436c4ec70b008c46e55e154b7c705 Mon Sep 17 00:00:00 2001
From: Maksim Levental <maksim.levental@gmail.com>
Date: Mon, 10 Nov 2025 07:44:08 -0800
Subject: [PATCH 19/28] [MLIR][Python] add/fix docstrings in IRCore (#167063)

This PR adds all the missing doc strings in IRCore.cpp. It also

1. Normalizes all doc strings to have proper punctuation;
2. Inlines non-duplicated docstrings which are currently at the top of
the source file (and thereby possibly out of sync).

Follow-up PRs will do the same for the rest of the modules/source files.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 mlir/lib/Bindings/Python/IRCore.cpp | 1812 +++++++++++++++++----------
 1 file changed, 1116 insertions(+), 696 deletions(-)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index cda4fe19c16f8..d90f27bd037e6 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -32,33 +32,6 @@ using llvm::SmallVector;
 using llvm::StringRef;
 using llvm::Twine;
 
-//------------------------------------------------------------------------------
-// Docstrings (trivial, non-duplicated docstrings are included inline).
-//------------------------------------------------------------------------------
-
-static const char kContextParseTypeDocstring[] =
-    R"(Parses the assembly form of a type.
-
-Returns a Type object or raises an MLIRError if the type cannot be parsed.
-
-See also: https://mlir.llvm.org/docs/LangRef/#type-system
-)";
-
-static const char kContextGetCallSiteLocationDocstring[] =
-    R"(Gets a Location representing a caller and callsite)";
-
-static const char kContextGetFileLocationDocstring[] =
-    R"(Gets a Location representing a file, line and column)";
-
-static const char kContextGetFileRangeDocstring[] =
-    R"(Gets a Location representing a file, line and column range)";
-
-static const char kContextGetFusedLocationDocstring[] =
-    R"(Gets a Location representing a fused location with optional metadata)";
-
-static const char kContextGetNameLocationDocString[] =
-    R"(Gets a Location representing a named location with optional child location)";
-
 static const char kModuleParseDocstring[] =
     R"(Parses a module's assembly format from a string.
 
@@ -67,132 +40,12 @@ Returns a new MlirModule or raises an MLIRError if the parsing fails.
 See also: https://mlir.llvm.org/docs/LangRef/
 )";
 
-static const char kModuleCAPICreate[] =
-    R"(Creates a Module from a MlirModule wrapped by a capsule (i.e. module._CAPIPtr).
-Note this returns a new object BUT _clear_mlir_module(module) must be called to
-prevent double-frees (of the underlying mlir::Module).
-)";
-
-static const char kOperationCreateDocstring[] =
-    R"(Creates a new operation.
-
-Args:
-  name: Operation name (e.g. "dialect.operation").
-  results: Sequence of Type representing op result types.
-  attributes: Dict of str:Attribute.
-  successors: List of Block for the operation's successors.
-  regions: Number of regions to create.
-  location: A Location object (defaults to resolve from context manager).
-  ip: An InsertionPoint (defaults to resolve from context manager or set to
-    False to disable insertion, even with an insertion point set in the
-    context manager).
-  infer_type: Whether to infer result types.
-Returns:
-  A new "detached" Operation object. Detached operations can be added
-  to blocks, which causes them to become "attached."
-)";
-
-static const char kOperationPrintDocstring[] =
-    R"(Prints the assembly form of the operation to a file like object.
-
-Args:
-  file: The file like object to write to. Defaults to sys.stdout.
-  binary: Whether to write bytes (True) or str (False). Defaults to False.
-  large_elements_limit: Whether to elide elements attributes above this
-    number of elements. Defaults to None (no limit).
-  large_resource_limit: Whether to elide resource attributes above this
-    number of characters. Defaults to None (no limit). If large_elements_limit
-    is set and this is None, the behavior will be to use large_elements_limit
-    as large_resource_limit.
-  enable_debug_info: Whether to print debug/location information. Defaults
-    to False.
-  pretty_debug_info: Whether to format debug information for easier reading
-    by a human (warning: the result is unparseable).
-  print_generic_op_form: Whether to print the generic assembly forms of all
-    ops. Defaults to False.
-  use_local_Scope: Whether to print in a way that is more optimized for
-    multi-threaded access but may not be consistent with how the overall
-    module prints.
-  assume_verified: By default, if not printing generic form, the verifier
-    will be run and if it fails, generic form will be printed with a comment
-    about failed verification. While a reasonable default for interactive use,
-    for systematic use, it is often better for the caller to verify explicitly
-    and report failures in a more robust fashion. Set this to True if doing this
-    in order to avoid running a redundant verification. If the IR is actually
-    invalid, behavior is undefined.
-  skip_regions: Whether to skip printing regions. Defaults to False.
-)";
-
-static const char kOperationPrintStateDocstring[] =
-    R"(Prints the assembly form of the operation to a file like object.
-
-Args:
-  file: The file like object to write to. Defaults to sys.stdout.
-  binary: Whether to write bytes (True) or str (False). Defaults to False.
-  state: AsmState capturing the operation numbering and flags.
-)";
-
-static const char kOperationGetAsmDocstring[] =
-    R"(Gets the assembly form of the operation with all options available.
-
-Args:
-  binary: Whether to return a bytes (True) or str (False) object. Defaults to
-    False.
-  ... others ...: See the print() method for common keyword arguments for
-    configuring the printout.
-Returns:
-  Either a bytes or str object, depending on the setting of the 'binary'
-  argument.
-)";
-
-static const char kOperationPrintBytecodeDocstring[] =
-    R"(Write the bytecode form of the operation to a file like object.
-
-Args:
-  file: The file like object to write to.
-  desired_version: The version of bytecode to emit.
-Returns:
-  The bytecode writer status.
-)";
-
-static const char kOperationStrDunderDocstring[] =
-    R"(Gets the assembly form of the operation with default options.
-
-If more advanced control over the assembly formatting or I/O options is needed,
-use the dedicated print or get_asm method, which supports keyword arguments to
-customize behavior.
-)";
-
 static const char kDumpDocstring[] =
-    R"(Dumps a debug representation of the object to stderr.)";
-
-static const char kAppendBlockDocstring[] =
-    R"(Appends a new block, with argument types as positional args.
-
-Returns:
-  The created block.
-)";
-
-static const char kValueDunderStrDocstring[] =
-    R"(Returns the string form of the value.
-
-If the value is a block argument, this is the assembly form of its type and the
-position in the argument list. If the value is an operation result, this is
-equivalent to printing the operation that produced it.
-)";
-
-static const char kGetNameAsOperand[] =
-    R"(Returns the string form of value as an operand (i.e., the ValueID).
-)";
-
-static const char kValueReplaceAllUsesWithDocstring[] =
-    R"(Replace all uses of value with the new value, updating anything in
-the IR that uses 'self' to use the other value instead.
-)";
+    "Dumps a debug representation of the object to stderr.";
 
 static const char kValueReplaceAllUsesExceptDocstring[] =
-    R"("Replace all uses of this value with the 'with' value, except for those
-in 'exceptions'. 'exceptions' can be either a single operation or a list of
+    R"(Replace all uses of this value with the `with` value, except for those
+in `exceptions`. `exceptions` can be either a single operation or a list of
 operations.
 )";
 
@@ -274,22 +127,26 @@ struct PyGlobalDebugFlag {
     // Debug flags.
     nb::class_<PyGlobalDebugFlag>(m, "_GlobalDebug")
         .def_prop_rw_static("flag", &PyGlobalDebugFlag::get,
-                            &PyGlobalDebugFlag::set, "LLVM-wide debug flag")
+                            &PyGlobalDebugFlag::set, "LLVM-wide debug flag.")
         .def_static(
             "set_types",
             [](const std::string &type) {
               nb::ft_lock_guard lock(mutex);
               mlirSetGlobalDebugType(type.c_str());
             },
-            "types"_a, "Sets specific debug types to be produced by LLVM")
-        .def_static("set_types", [](const std::vector<std::string> &types) {
-          std::vector<const char *> pointers;
-          pointers.reserve(types.size());
-          for (const std::string &str : types)
-            pointers.push_back(str.c_str());
-          nb::ft_lock_guard lock(mutex);
-          mlirSetGlobalDebugTypes(pointers.data(), pointers.size());
-        });
+            "types"_a, "Sets specific debug types to be produced by LLVM.")
+        .def_static(
+            "set_types",
+            [](const std::vector<std::string> &types) {
+              std::vector<const char *> pointers;
+              pointers.reserve(types.size());
+              for (const std::string &str : types)
+                pointers.push_back(str.c_str());
+              nb::ft_lock_guard lock(mutex);
+              mlirSetGlobalDebugTypes(pointers.data(), pointers.size());
+            },
+            "types"_a,
+            "Sets multiple specific debug types to be produced by LLVM.");
   }
 
 private:
@@ -316,12 +173,18 @@ struct PyAttrBuilderMap {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyAttrBuilderMap>(m, "AttrBuilder")
-        .def_static("contains", &PyAttrBuilderMap::dunderContains)
-        .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed)
+        .def_static("contains", &PyAttrBuilderMap::dunderContains,
+                    "attribute_kind"_a,
+                    "Checks whether an attribute builder is registered for the "
+                    "given attribute kind.")
+        .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed,
+                    "attribute_kind"_a,
+                    "Gets the registered attribute builder for the given "
+                    "attribute kind.")
         .def_static("insert", &PyAttrBuilderMap::dunderSetItemNamed,
                     "attribute_kind"_a, "attr_builder"_a, "replace"_a = false,
                     "Register an attribute builder for building MLIR "
-                    "attributes from python values.");
+                    "attributes from Python values.");
   }
 };
 
@@ -357,8 +220,10 @@ class PyRegionIterator {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyRegionIterator>(m, "RegionIterator")
-        .def("__iter__", &PyRegionIterator::dunderIter)
-        .def("__next__", &PyRegionIterator::dunderNext);
+        .def("__iter__", &PyRegionIterator::dunderIter,
+             "Returns an iterator over the regions in the operation.")
+        .def("__next__", &PyRegionIterator::dunderNext,
+             "Returns the next region in the iteration.");
   }
 
 private:
@@ -386,7 +251,8 @@ class PyRegionList : public Sliceable<PyRegionList, PyRegion> {
   }
 
   static void bindDerived(ClassTy &c) {
-    c.def("__iter__", &PyRegionList::dunderIter);
+    c.def("__iter__", &PyRegionList::dunderIter,
+          "Returns an iterator over the regions in the sequence.");
   }
 
 private:
@@ -430,8 +296,10 @@ class PyBlockIterator {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyBlockIterator>(m, "BlockIterator")
-        .def("__iter__", &PyBlockIterator::dunderIter)
-        .def("__next__", &PyBlockIterator::dunderNext);
+        .def("__iter__", &PyBlockIterator::dunderIter,
+             "Returns an iterator over the blocks in the operation's region.")
+        .def("__next__", &PyBlockIterator::dunderNext,
+             "Returns the next block in the iteration.");
   }
 
 private:
@@ -493,10 +361,19 @@ class PyBlockList {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyBlockList>(m, "BlockList")
-        .def("__getitem__", &PyBlockList::dunderGetItem)
-        .def("__iter__", &PyBlockList::dunderIter)
-        .def("__len__", &PyBlockList::dunderLen)
-        .def("append", &PyBlockList::appendBlock, kAppendBlockDocstring,
+        .def("__getitem__", &PyBlockList::dunderGetItem,
+             "Returns the block at the specified index.")
+        .def("__iter__", &PyBlockList::dunderIter,
+             "Returns an iterator over blocks in the operation's region.")
+        .def("__len__", &PyBlockList::dunderLen,
+             "Returns the number of blocks in the operation's region.")
+        .def("append", &PyBlockList::appendBlock,
+             R"(
+              Appends a new block, with argument types as positional args.
+
+              Returns:
+                The created block.
+             )",
              nb::arg("args"), nb::kw_only(),
              nb::arg("arg_locs") = std::nullopt);
   }
@@ -527,8 +404,10 @@ class PyOperationIterator {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyOperationIterator>(m, "OperationIterator")
-        .def("__iter__", &PyOperationIterator::dunderIter)
-        .def("__next__", &PyOperationIterator::dunderNext);
+        .def("__iter__", &PyOperationIterator::dunderIter,
+             "Returns an iterator over the operations in an operation's block.")
+        .def("__next__", &PyOperationIterator::dunderNext,
+             "Returns the next operation in the iteration.");
   }
 
 private:
@@ -584,9 +463,12 @@ class PyOperationList {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyOperationList>(m, "OperationList")
-        .def("__getitem__", &PyOperationList::dunderGetItem)
-        .def("__iter__", &PyOperationList::dunderIter)
-        .def("__len__", &PyOperationList::dunderLen);
+        .def("__getitem__", &PyOperationList::dunderGetItem,
+             "Returns the operation at the specified index.")
+        .def("__iter__", &PyOperationList::dunderIter,
+             "Returns an iterator over operations in the list.")
+        .def("__len__", &PyOperationList::dunderLen,
+             "Returns the number of operations in the list.");
   }
 
 private:
@@ -609,8 +491,10 @@ class PyOpOperand {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyOpOperand>(m, "OpOperand")
-        .def_prop_ro("owner", &PyOpOperand::getOwner)
-        .def_prop_ro("operand_number", &PyOpOperand::getOperandNumber);
+        .def_prop_ro("owner", &PyOpOperand::getOwner,
+                     "Returns the operation that owns this operand.")
+        .def_prop_ro("operand_number", &PyOpOperand::getOperandNumber,
+                     "Returns the operand number in the owning operation.");
   }
 
 private:
@@ -634,8 +518,10 @@ class PyOpOperandIterator {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyOpOperandIterator>(m, "OpOperandIterator")
-        .def("__iter__", &PyOpOperandIterator::dunderIter)
-        .def("__next__", &PyOpOperandIterator::dunderNext);
+        .def("__iter__", &PyOpOperandIterator::dunderIter,
+             "Returns an iterator over operands.")
+        .def("__next__", &PyOpOperandIterator::dunderNext,
+             "Returns the next operand in the iteration.");
   }
 
 private:
@@ -1626,16 +1512,21 @@ class PyOpResult : public PyConcreteValue<PyOpResult> {
 
   static void bindDerived(ClassTy &c) {
     c.def_prop_ro(
-        "owner", [](PyOpResult &self) -> nb::typed<nb::object, PyOperation> {
+        "owner",
+        [](PyOpResult &self) -> nb::typed<nb::object, PyOperation> {
           assert(mlirOperationEqual(self.getParentOperation()->get(),
                                     mlirOpResultGetOwner(self.get())) &&
                  "expected the owner of the value in Python to match that in "
                  "the IR");
           return self.getParentOperation().getObject();
-        });
-    c.def_prop_ro("result_number", [](PyOpResult &self) {
-      return mlirOpResultGetResultNumber(self.get());
-    });
+        },
+        "Returns the operation that produces this result.");
+    c.def_prop_ro(
+        "result_number",
+        [](PyOpResult &self) {
+          return mlirOpResultGetResultNumber(self.get());
+        },
+        "Returns the position of this result in the operation's result list.");
   }
 };
 
@@ -1671,13 +1562,18 @@ class PyOpResultList : public Sliceable<PyOpResultList, PyOpResult> {
         operation(std::move(operation)) {}
 
   static void bindDerived(ClassTy &c) {
-    c.def_prop_ro("types", [](PyOpResultList &self) {
-      return getValueTypes(self, self.operation->getContext());
-    });
-    c.def_prop_ro("owner",
-                  [](PyOpResultList &self) -> nb::typed<nb::object, PyOpView> {
-                    return self.operation->createOpView();
-                  });
+    c.def_prop_ro(
+        "types",
+        [](PyOpResultList &self) {
+          return getValueTypes(self, self.operation->getContext());
+        },
+        "Returns a list of types for all results in this result list.");
+    c.def_prop_ro(
+        "owner",
+        [](PyOpResultList &self) -> nb::typed<nb::object, PyOpView> {
+          return self.operation->createOpView();
+        },
+        "Returns the operation that owns this result list.");
   }
 
   PyOperationRef &getOperation() { return operation; }
@@ -2427,19 +2323,25 @@ class PyBlockArgument : public PyConcreteValue<PyBlockArgument> {
   using PyConcreteValue::PyConcreteValue;
 
   static void bindDerived(ClassTy &c) {
-    c.def_prop_ro("owner", [](PyBlockArgument &self) {
-      return PyBlock(self.getParentOperation(),
-                     mlirBlockArgumentGetOwner(self.get()));
-    });
-    c.def_prop_ro("arg_number", [](PyBlockArgument &self) {
-      return mlirBlockArgumentGetArgNumber(self.get());
-    });
+    c.def_prop_ro(
+        "owner",
+        [](PyBlockArgument &self) {
+          return PyBlock(self.getParentOperation(),
+                         mlirBlockArgumentGetOwner(self.get()));
+        },
+        "Returns the block that owns this argument.");
+    c.def_prop_ro(
+        "arg_number",
+        [](PyBlockArgument &self) {
+          return mlirBlockArgumentGetArgNumber(self.get());
+        },
+        "Returns the position of this argument in the block's argument list.");
     c.def(
         "set_type",
         [](PyBlockArgument &self, PyType type) {
           return mlirBlockArgumentSetType(self.get(), type);
         },
-        nb::arg("type"));
+        nb::arg("type"), "Sets the type of this block argument.");
   }
 };
 
@@ -2462,9 +2364,12 @@ class PyBlockArgumentList
         operation(std::move(operation)), block(block) {}
 
   static void bindDerived(ClassTy &c) {
-    c.def_prop_ro("types", [](PyBlockArgumentList &self) {
-      return getValueTypes(self, self.operation->getContext());
-    });
+    c.def_prop_ro(
+        "types",
+        [](PyBlockArgumentList &self) {
+          return getValueTypes(self, self.operation->getContext());
+        },
+        "Returns a list of types for all arguments in this argument list.");
   }
 
 private:
@@ -2516,7 +2421,9 @@ class PyOpOperandList : public Sliceable<PyOpOperandList, PyValue> {
   }
 
   static void bindDerived(ClassTy &c) {
-    c.def("__setitem__", &PyOpOperandList::dunderSetItem);
+    c.def("__setitem__", &PyOpOperandList::dunderSetItem, nb::arg("index"),
+          nb::arg("value"),
+          "Sets the operand at the specified index to a new value.");
   }
 
 private:
@@ -2571,7 +2478,8 @@ class PyOpSuccessors : public Sliceable<PyOpSuccessors, PyBlock> {
   }
 
   static void bindDerived(ClassTy &c) {
-    c.def("__setitem__", &PyOpSuccessors::dunderSetItem);
+    c.def("__setitem__", &PyOpSuccessors::dunderSetItem, nb::arg("index"),
+          nb::arg("block"), "Sets the successor block at the specified index.");
   }
 
 private:
@@ -2743,55 +2651,70 @@ class PyOpAttributeMap {
 
   static void bind(nb::module_ &m) {
     nb::class_<PyOpAttributeMap>(m, "OpAttributeMap")
-        .def("__contains__", &PyOpAttributeMap::dunderContains)
-        .def("__len__", &PyOpAttributeMap::dunderLen)
-        .def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed)
-        .def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed)
-        .def("__setitem__", &PyOpAttributeMap::dunderSetItem)
-        .def("__delitem__", &PyOpAttributeMap::dunderDelItem)
-        .def("__iter__",
-             [](PyOpAttributeMap &self) {
-               nb::list keys;
-               PyOpAttributeMap::forEachAttr(
-                   self.operation->get(),
-                   [&](MlirStringRef name, MlirAttribute) {
-                     keys.append(nb::str(name.data, name.length));
-                   });
-               return nb::iter(keys);
-             })
-        .def("keys",
-             [](PyOpAttributeMap &self) {
-               nb::list out;
-               PyOpAttributeMap::forEachAttr(
-                   self.operation->get(),
-                   [&](MlirStringRef name, MlirAttribute) {
-                     out.append(nb::str(name.data, name.length));
-                   });
-               return out;
-             })
-        .def("values",
-             [](PyOpAttributeMap &self) {
-               nb::list out;
-               PyOpAttributeMap::forEachAttr(
-                   self.operation->get(),
-                   [&](MlirStringRef, MlirAttribute attr) {
-                     out.append(PyAttribute(self.operation->getContext(), attr)
-                                    .maybeDownCast());
-                   });
-               return out;
-             })
-        .def("items", [](PyOpAttributeMap &self) {
-          nb::list out;
-          PyOpAttributeMap::forEachAttr(
-              self.operation->get(),
-              [&](MlirStringRef name, MlirAttribute attr) {
-                out.append(nb::make_tuple(
-                    nb::str(name.data, name.length),
-                    PyAttribute(self.operation->getContext(), attr)
-                        .maybeDownCast()));
-              });
-          return out;
-        });
+        .def("__contains__", &PyOpAttributeMap::dunderContains, nb::arg("name"),
+             "Checks if an attribute with the given name exists in the map.")
+        .def("__len__", &PyOpAttributeMap::dunderLen,
+             "Returns the number of attributes in the map.")
+        .def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed,
+             nb::arg("name"), "Gets an attribute by name.")
+        .def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed,
+             nb::arg("index"), "Gets a named attribute by index.")
+        .def("__setitem__", &PyOpAttributeMap::dunderSetItem, nb::arg("name"),
+             nb::arg("attr"), "Sets an attribute with the given name.")
+        .def("__delitem__", &PyOpAttributeMap::dunderDelItem, nb::arg("name"),
+             "Deletes an attribute with the given name.")
+        .def(
+            "__iter__",
+            [](PyOpAttributeMap &self) {
+              nb::list keys;
+              PyOpAttributeMap::forEachAttr(
+                  self.operation->get(),
+                  [&](MlirStringRef name, MlirAttribute) {
+                    keys.append(nb::str(name.data, name.length));
+                  });
+              return nb::iter(keys);
+            },
+            "Iterates over attribute names.")
+        .def(
+            "keys",
+            [](PyOpAttributeMap &self) {
+              nb::list out;
+              PyOpAttributeMap::forEachAttr(
+                  self.operation->get(),
+                  [&](MlirStringRef name, MlirAttribute) {
+                    out.append(nb::str(name.data, name.length));
+                  });
+              return out;
+            },
+            "Returns a list of attribute names.")
+        .def(
+            "values",
+            [](PyOpAttributeMap &self) {
+              nb::list out;
+              PyOpAttributeMap::forEachAttr(
+                  self.operation->get(),
+                  [&](MlirStringRef, MlirAttribute attr) {
+                    out.append(PyAttribute(self.operation->getContext(), attr)
+                                   .maybeDownCast());
+                  });
+              return out;
+            },
+            "Returns a list of attribute values.")
+        .def(
+            "items",
+            [](PyOpAttributeMap &self) {
+              nb::list out;
+              PyOpAttributeMap::forEachAttr(
+                  self.operation->get(),
+                  [&](MlirStringRef name, MlirAttribute attr) {
+                    out.append(nb::make_tuple(
+                        nb::str(name.data, name.length),
+                        PyAttribute(self.operation->getContext(), attr)
+                            .maybeDownCast()));
+                  });
+              return out;
+            },
+            "Returns a list of `(name, attribute)` tuples.");
   }
 
 private:
@@ -2979,62 +2902,103 @@ void mlir::python::populateIRCore(nb::module_ &m) {
   // Mapping of Diagnostics.
   //----------------------------------------------------------------------------
   nb::class_<PyDiagnostic>(m, "Diagnostic")
-      .def_prop_ro("severity", &PyDiagnostic::getSeverity)
-      .def_prop_ro("location", &PyDiagnostic::getLocation)
-      .def_prop_ro("message", &PyDiagnostic::getMessage)
-      .def_prop_ro("notes", &PyDiagnostic::getNotes)
-      .def("__str__", [](PyDiagnostic &self) -> nb::str {
-        if (!self.isValid())
-          return nb::str("<Invalid Diagnostic>");
-        return self.getMessage();
-      });
+      .def_prop_ro("severity", &PyDiagnostic::getSeverity,
+                   "Returns the severity of the diagnostic.")
+      .def_prop_ro("location", &PyDiagnostic::getLocation,
+                   "Returns the location associated with the diagnostic.")
+      .def_prop_ro("message", &PyDiagnostic::getMessage,
+                   "Returns the message text of the diagnostic.")
+      .def_prop_ro("notes", &PyDiagnostic::getNotes,
+                   "Returns a tuple of attached note diagnostics.")
+      .def(
+          "__str__",
+          [](PyDiagnostic &self) -> nb::str {
+            if (!self.isValid())
+              return nb::str("<Invalid Diagnostic>");
+            return self.getMessage();
+          },
+          "Returns the diagnostic message as a string.");
 
   nb::class_<PyDiagnostic::DiagnosticInfo>(m, "DiagnosticInfo")
-      .def("__init__",
-           [](PyDiagnostic::DiagnosticInfo &self, PyDiagnostic diag) {
-             new (&self) PyDiagnostic::DiagnosticInfo(diag.getInfo());
-           })
-      .def_ro("severity", &PyDiagnostic::DiagnosticInfo::severity)
-      .def_ro("location", &PyDiagnostic::DiagnosticInfo::location)
-      .def_ro("message", &PyDiagnostic::DiagnosticInfo::message)
-      .def_ro("notes", &PyDiagnostic::DiagnosticInfo::notes)
-      .def("__str__",
-           [](PyDiagnostic::DiagnosticInfo &self) { return self.message; });
+      .def(
+          "__init__",
+          [](PyDiagnostic::DiagnosticInfo &self, PyDiagnostic diag) {
+            new (&self) PyDiagnostic::DiagnosticInfo(diag.getInfo());
+          },
+          "diag"_a, "Creates a DiagnosticInfo from a Diagnostic.")
+      .def_ro("severity", &PyDiagnostic::DiagnosticInfo::severity,
+              "The severity level of the diagnostic.")
+      .def_ro("location", &PyDiagnostic::DiagnosticInfo::location,
+              "The location associated with the diagnostic.")
+      .def_ro("message", &PyDiagnostic::DiagnosticInfo::message,
+              "The message text of the diagnostic.")
+      .def_ro("notes", &PyDiagnostic::DiagnosticInfo::notes,
+              "List of attached note diagnostics.")
+      .def(
+          "__str__",
+          [](PyDiagnostic::DiagnosticInfo &self) { return self.message; },
+          "Returns the diagnostic message as a string.");
 
   nb::class_<PyDiagnosticHandler>(m, "DiagnosticHandler")
-      .def("detach", &PyDiagnosticHandler::detach)
-      .def_prop_ro("attached", &PyDiagnosticHandler::isAttached)
-      .def_prop_ro("had_error", &PyDiagnosticHandler::getHadError)
-      .def("__enter__", &PyDiagnosticHandler::contextEnter)
+      .def("detach", &PyDiagnosticHandler::detach,
+           "Detaches the diagnostic handler from the context.")
+      .def_prop_ro("attached", &PyDiagnosticHandler::isAttached,
+                   "Returns True if the handler is attached to a context.")
+      .def_prop_ro("had_error", &PyDiagnosticHandler::getHadError,
+                   "Returns True if an error was encountered during diagnostic "
+                   "handling.")
+      .def("__enter__", &PyDiagnosticHandler::contextEnter,
+           "Enters the diagnostic handler as a context manager.")
       .def("__exit__", &PyDiagnosticHandler::contextExit,
            nb::arg("exc_type").none(), nb::arg("exc_value").none(),
-           nb::arg("traceback").none());
+           nb::arg("traceback").none(),
+           "Exits the diagnostic handler context manager.");
 
   // Expose DefaultThreadPool to python
   nb::class_<PyThreadPool>(m, "ThreadPool")
-      .def("__init__", [](PyThreadPool &self) { new (&self) PyThreadPool(); })
-      .def("get_max_concurrency", &PyThreadPool::getMaxConcurrency)
-      .def("_mlir_thread_pool_ptr", &PyThreadPool::_mlir_thread_pool_ptr);
+      .def(
+          "__init__", [](PyThreadPool &self) { new (&self) PyThreadPool(); },
+          "Creates a new thread pool with default concurrency.")
+      .def("get_max_concurrency", &PyThreadPool::getMaxConcurrency,
+           "Returns the maximum number of threads in the pool.")
+      .def("_mlir_thread_pool_ptr", &PyThreadPool::_mlir_thread_pool_ptr,
+           "Returns the raw pointer to the LLVM thread pool as a string.");
 
   nb::class_<PyMlirContext>(m, "Context")
-      .def("__init__",
-           [](PyMlirContext &self) {
-             MlirContext context = mlirContextCreateWithThreading(false);
-             new (&self) PyMlirContext(context);
-           })
-      .def_static("_get_live_count", &PyMlirContext::getLiveCount)
-      .def("_get_context_again",
-           [](PyMlirContext &self) -> nb::typed<nb::object, PyMlirContext> {
-             PyMlirContextRef ref = PyMlirContext::forContext(self.get());
-             return ref.releaseObject();
-           })
-      .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount)
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule)
+      .def(
+          "__init__",
+          [](PyMlirContext &self) {
+            MlirContext context = mlirContextCreateWithThreading(false);
+            new (&self) PyMlirContext(context);
+          },
+          R"(
+            Creates a new MLIR context.
+
+            The context is the top-level container for all MLIR objects. It owns the storage
+            for types, attributes, locations, and other core IR objects. A context can be
+            configured to allow or disallow unregistered dialects and can have dialects
+            loaded on-demand.)")
+      .def_static("_get_live_count", &PyMlirContext::getLiveCount,
+                  "Gets the number of live Context objects.")
+      .def(
+          "_get_context_again",
+          [](PyMlirContext &self) -> nb::typed<nb::object, PyMlirContext> {
+            PyMlirContextRef ref = PyMlirContext::forContext(self.get());
+            return ref.releaseObject();
+          },
+          "Gets another reference to the same context.")
+      .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount,
+           "Gets the number of live modules owned by this context.")
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule,
+                   "Gets a capsule wrapping the MlirContext.")
       .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR,
-                  &PyMlirContext::createFromCapsule)
-      .def("__enter__", &PyMlirContext::contextEnter)
+                  &PyMlirContext::createFromCapsule,
+                  "Creates a Context from a capsule wrapping MlirContext.")
+      .def("__enter__", &PyMlirContext::contextEnter,
+           "Enters the context as a context manager.")
       .def("__exit__", &PyMlirContext::contextExit, nb::arg("exc_type").none(),
-           nb::arg("exc_value").none(), nb::arg("traceback").none())
+           nb::arg("exc_value").none(), nb::arg("traceback").none(),
+           "Exits the context manager.")
       .def_prop_ro_static(
           "current",
           [](nb::object & /*class*/)
@@ -3045,14 +3009,15 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return nb::cast(context);
           },
           nb::sig("def current(/) -> Context | None"),
-          "Gets the Context bound to the current thread or raises ValueError")
+          "Gets the Context bound to the current thread or returns None if no "
+          "context is set.")
       .def_prop_ro(
           "dialects",
           [](PyMlirContext &self) { return PyDialects(self.getRef()); },
-          "Gets a container for accessing dialects by name")
+          "Gets a container for accessing dialects by name.")
       .def_prop_ro(
           "d", [](PyMlirContext &self) { return PyDialects(self.getRef()); },
-          "Alias for 'dialect'")
+          "Alias for `dialects`.")
       .def(
           "get_dialect_descriptor",
           [=](PyMlirContext &self, std::string &name) {
@@ -3065,7 +3030,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return PyDialectDescriptor(self.getRef(), dialect);
           },
           nb::arg("dialect_name"),
-          "Gets or loads a dialect by name, returning its descriptor object")
+          "Gets or loads a dialect by name, returning its descriptor object.")
       .def_prop_rw(
           "allow_unregistered_dialects",
           [](PyMlirContext &self) -> bool {
@@ -3073,67 +3038,110 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           },
           [](PyMlirContext &self, bool value) {
             mlirContextSetAllowUnregisteredDialects(self.get(), value);
-          })
+          },
+          "Controls whether unregistered dialects are allowed in this context.")
       .def("attach_diagnostic_handler", &PyMlirContext::attachDiagnosticHandler,
            nb::arg("callback"),
-           "Attaches a diagnostic handler that will receive callbacks")
+           "Attaches a diagnostic handler that will receive callbacks.")
       .def(
           "enable_multithreading",
           [](PyMlirContext &self, bool enable) {
             mlirContextEnableMultithreading(self.get(), enable);
           },
-          nb::arg("enable"))
-      .def("set_thread_pool",
-           [](PyMlirContext &self, PyThreadPool &pool) {
-             // we should disable multi-threading first before setting
-             // new thread pool otherwise the assert in
-             // MLIRContext::setThreadPool will be raised.
-             mlirContextEnableMultithreading(self.get(), false);
-             mlirContextSetThreadPool(self.get(), pool.get());
-           })
-      .def("get_num_threads",
-           [](PyMlirContext &self) {
-             return mlirContextGetNumThreads(self.get());
-           })
-      .def("_mlir_thread_pool_ptr",
-           [](PyMlirContext &self) {
-             MlirLlvmThreadPool pool = mlirContextGetThreadPool(self.get());
-             std::stringstream ss;
-             ss << pool.ptr;
-             return ss.str();
-           })
+          nb::arg("enable"),
+          R"(
+            Enables or disables multi-threading support in the context.
+
+            Args:
+              enable: Whether to enable (True) or disable (False) multi-threading.
+          )")
+      .def(
+          "set_thread_pool",
+          [](PyMlirContext &self, PyThreadPool &pool) {
+            // we should disable multi-threading first before setting
+            // new thread pool otherwise the assert in
+            // MLIRContext::setThreadPool will be raised.
+            mlirContextEnableMultithreading(self.get(), false);
+            mlirContextSetThreadPool(self.get(), pool.get());
+          },
+          R"(
+            Sets a custom thread pool for the context to use.
+
+            Args:
+              pool: A ThreadPool object to use for parallel operations.
+
+            Note:
+              Multi-threading is automatically disabled before setting the thread pool.)")
+      .def(
+          "get_num_threads",
+          [](PyMlirContext &self) {
+            return mlirContextGetNumThreads(self.get());
+          },
+          "Gets the number of threads in the context's thread pool.")
+      .def(
+          "_mlir_thread_pool_ptr",
+          [](PyMlirContext &self) {
+            MlirLlvmThreadPool pool = mlirContextGetThreadPool(self.get());
+            std::stringstream ss;
+            ss << pool.ptr;
+            return ss.str();
+          },
+          "Gets the raw pointer to the LLVM thread pool as a string.")
       .def(
           "is_registered_operation",
           [](PyMlirContext &self, std::string &name) {
             return mlirContextIsRegisteredOperation(
                 self.get(), MlirStringRef{name.data(), name.size()});
           },
-          nb::arg("operation_name"))
+          nb::arg("operation_name"),
+          R"(
+            Checks whether an operation with the given name is registered.
+
+            Args:
+              operation_name: The fully qualified name of the operation (e.g., `arith.addf`).
+
+            Returns:
+              True if the operation is registered, False otherwise.)")
       .def(
           "append_dialect_registry",
           [](PyMlirContext &self, PyDialectRegistry &registry) {
             mlirContextAppendDialectRegistry(self.get(), registry);
           },
-          nb::arg("registry"))
+          nb::arg("registry"),
+          R"(
+            Appends the contents of a dialect registry to the context.
+
+            Args:
+              registry: A DialectRegistry containing dialects to append.)")
       .def_prop_rw("emit_error_diagnostics",
                    &PyMlirContext::getEmitErrorDiagnostics,
                    &PyMlirContext::setEmitErrorDiagnostics,
-                   "Emit error diagnostics to diagnostic handlers. By default "
-                   "error diagnostics are captured and reported through "
-                   "MLIRError exceptions.")
-      .def("load_all_available_dialects", [](PyMlirContext &self) {
-        mlirContextLoadAllAvailableDialects(self.get());
-      });
+                   R"(
+            Controls whether error diagnostics are emitted to diagnostic handlers.
+
+            By default, error diagnostics are captured and reported through MLIRError exceptions.)")
+      .def(
+          "load_all_available_dialects",
+          [](PyMlirContext &self) {
+            mlirContextLoadAllAvailableDialects(self.get());
+          },
+          R"(
+            Loads all dialects available in the registry into the context.
+
+            This eagerly loads all dialects that have been registered, making them
+            immediately available for use.)");
 
   //----------------------------------------------------------------------------
   // Mapping of PyDialectDescriptor
   //----------------------------------------------------------------------------
   nb::class_<PyDialectDescriptor>(m, "DialectDescriptor")
-      .def_prop_ro("namespace",
-                   [](PyDialectDescriptor &self) {
-                     MlirStringRef ns = mlirDialectGetNamespace(self.get());
-                     return nb::str(ns.data, ns.length);
-                   })
+      .def_prop_ro(
+          "namespace",
+          [](PyDialectDescriptor &self) {
+            MlirStringRef ns = mlirDialectGetNamespace(self.get());
+            return nb::str(ns.data, ns.length);
+          },
+          "Returns the namespace of the dialect.")
       .def(
           "__repr__",
           [](PyDialectDescriptor &self) {
@@ -3143,35 +3151,43 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             repr.append(">");
             return repr;
           },
-          nb::sig("def __repr__(self) -> str"));
+          nb::sig("def __repr__(self) -> str"),
+          "Returns a string representation of the dialect descriptor.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyDialects
   //----------------------------------------------------------------------------
   nb::class_<PyDialects>(m, "Dialects")
-      .def("__getitem__",
-           [=](PyDialects &self, std::string keyName) {
-             MlirDialect dialect =
-                 self.getDialectForKey(keyName, /*attrError=*/false);
-             nb::object descriptor =
-                 nb::cast(PyDialectDescriptor{self.getContext(), dialect});
-             return createCustomDialectWrapper(keyName, std::move(descriptor));
-           })
-      .def("__getattr__", [=](PyDialects &self, std::string attrName) {
-        MlirDialect dialect =
-            self.getDialectForKey(attrName, /*attrError=*/true);
-        nb::object descriptor =
-            nb::cast(PyDialectDescriptor{self.getContext(), dialect});
-        return createCustomDialectWrapper(attrName, std::move(descriptor));
-      });
+      .def(
+          "__getitem__",
+          [=](PyDialects &self, std::string keyName) {
+            MlirDialect dialect =
+                self.getDialectForKey(keyName, /*attrError=*/false);
+            nb::object descriptor =
+                nb::cast(PyDialectDescriptor{self.getContext(), dialect});
+            return createCustomDialectWrapper(keyName, std::move(descriptor));
+          },
+          "Gets a dialect by name using subscript notation.")
+      .def(
+          "__getattr__",
+          [=](PyDialects &self, std::string attrName) {
+            MlirDialect dialect =
+                self.getDialectForKey(attrName, /*attrError=*/true);
+            nb::object descriptor =
+                nb::cast(PyDialectDescriptor{self.getContext(), dialect});
+            return createCustomDialectWrapper(attrName, std::move(descriptor));
+          },
+          "Gets a dialect by name using attribute notation.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyDialect
   //----------------------------------------------------------------------------
   nb::class_<PyDialect>(m, "Dialect")
-      .def(nb::init<nb::object>(), nb::arg("descriptor"))
-      .def_prop_ro("descriptor",
-                   [](PyDialect &self) { return self.getDescriptor(); })
+      .def(nb::init<nb::object>(), nb::arg("descriptor"),
+           "Creates a Dialect from a DialectDescriptor.")
+      .def_prop_ro(
+          "descriptor", [](PyDialect &self) { return self.getDescriptor(); },
+          "Returns the DialectDescriptor for this dialect.")
       .def(
           "__repr__",
           [](const nb::object &self) {
@@ -3181,31 +3197,43 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                    nb::str(" (class ") + clazz.attr("__module__") +
                    nb::str(".") + clazz.attr("__name__") + nb::str(")>");
           },
-          nb::sig("def __repr__(self) -> str"));
+          nb::sig("def __repr__(self) -> str"),
+          "Returns a string representation of the dialect.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyDialectRegistry
   //----------------------------------------------------------------------------
   nb::class_<PyDialectRegistry>(m, "DialectRegistry")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyDialectRegistry::getCapsule)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyDialectRegistry::getCapsule,
+                   "Gets a capsule wrapping the MlirDialectRegistry.")
       .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR,
-                  &PyDialectRegistry::createFromCapsule)
-      .def(nb::init<>());
+                  &PyDialectRegistry::createFromCapsule,
+                  "Creates a DialectRegistry from a capsule wrapping "
+                  "`MlirDialectRegistry`.")
+      .def(nb::init<>(), "Creates a new empty dialect registry.");
 
   //----------------------------------------------------------------------------
   // Mapping of Location
   //----------------------------------------------------------------------------
   nb::class_<PyLocation>(m, "Location")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule)
-      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule)
-      .def("__enter__", &PyLocation::contextEnter)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule,
+                   "Gets a capsule wrapping the MlirLocation.")
+      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule,
+                  "Creates a Location from a capsule wrapping MlirLocation.")
+      .def("__enter__", &PyLocation::contextEnter,
+           "Enters the location as a context manager.")
       .def("__exit__", &PyLocation::contextExit, nb::arg("exc_type").none(),
-           nb::arg("exc_value").none(), nb::arg("traceback").none())
-      .def("__eq__",
-           [](PyLocation &self, PyLocation &other) -> bool {
-             return mlirLocationEqual(self, other);
-           })
-      .def("__eq__", [](PyLocation &self, nb::object other) { return false; })
+           nb::arg("exc_value").none(), nb::arg("traceback").none(),
+           "Exits the location context manager.")
+      .def(
+          "__eq__",
+          [](PyLocation &self, PyLocation &other) -> bool {
+            return mlirLocationEqual(self, other);
+          },
+          "Compares two locations for equality.")
+      .def(
+          "__eq__", [](PyLocation &self, nb::object other) { return false; },
+          "Compares location with non-location object (always returns False).")
       .def_prop_ro_static(
           "current",
           [](nb::object & /*class*/) -> std::optional<PyLocation *> {
@@ -3217,7 +3245,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           // clang-format off
           nb::sig("def current(/) -> Location | None"),
           // clang-format on
-          "Gets the Location bound to the current thread or raises ValueError")
+          "Gets the Location bound to the current thread or raises ValueError.")
       .def_static(
           "unknown",
           [](DefaultingPyMlirContext context) {
@@ -3225,13 +3253,13 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                               mlirLocationUnknownGet(context->get()));
           },
           nb::arg("context") = nb::none(),
-          "Gets a Location representing an unknown location")
+          "Gets a Location representing an unknown location.")
       .def_static(
           "callsite",
           [](PyLocation callee, const std::vector<PyLocation> &frames,
              DefaultingPyMlirContext context) {
             if (frames.empty())
-              throw nb::value_error("No caller frames provided");
+              throw nb::value_error("No caller frames provided.");
             MlirLocation caller = frames.back().get();
             for (const PyLocation &frame :
                  llvm::reverse(llvm::ArrayRef(frames).drop_back()))
@@ -3240,18 +3268,23 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                               mlirLocationCallSiteGet(callee.get(), caller));
           },
           nb::arg("callee"), nb::arg("frames"), nb::arg("context") = nb::none(),
-          kContextGetCallSiteLocationDocstring)
-      .def("is_a_callsite", mlirLocationIsACallSite)
-      .def_prop_ro("callee",
-                   [](PyLocation &self) {
-                     return PyLocation(self.getContext(),
-                                       mlirLocationCallSiteGetCallee(self));
-                   })
-      .def_prop_ro("caller",
-                   [](PyLocation &self) {
-                     return PyLocation(self.getContext(),
-                                       mlirLocationCallSiteGetCaller(self));
-                   })
+          "Gets a Location representing a caller and callsite.")
+      .def("is_a_callsite", mlirLocationIsACallSite,
+           "Returns True if this location is a CallSiteLoc.")
+      .def_prop_ro(
+          "callee",
+          [](PyLocation &self) {
+            return PyLocation(self.getContext(),
+                              mlirLocationCallSiteGetCallee(self));
+          },
+          "Gets the callee location from a CallSiteLoc.")
+      .def_prop_ro(
+          "caller",
+          [](PyLocation &self) {
+            return PyLocation(self.getContext(),
+                              mlirLocationCallSiteGetCaller(self));
+          },
+          "Gets the caller location from a CallSiteLoc.")
       .def_static(
           "file",
           [](std::string filename, int line, int col,
@@ -3262,7 +3295,8 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                     context->get(), toMlirStringRef(filename), line, col));
           },
           nb::arg("filename"), nb::arg("line"), nb::arg("col"),
-          nb::arg("context") = nb::none(), kContextGetFileLocationDocstring)
+          nb::arg("context") = nb::none(),
+          "Gets a Location representing a file, line and column.")
       .def_static(
           "file",
           [](std::string filename, int startLine, int startCol, int endLine,
@@ -3274,17 +3308,25 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           },
           nb::arg("filename"), nb::arg("start_line"), nb::arg("start_col"),
           nb::arg("end_line"), nb::arg("end_col"),
-          nb::arg("context") = nb::none(), kContextGetFileRangeDocstring)
-      .def("is_a_file", mlirLocationIsAFileLineColRange)
-      .def_prop_ro("filename",
-                   [](MlirLocation loc) {
-                     return mlirIdentifierStr(
-                         mlirLocationFileLineColRangeGetFilename(loc));
-                   })
-      .def_prop_ro("start_line", mlirLocationFileLineColRangeGetStartLine)
-      .def_prop_ro("start_col", mlirLocationFileLineColRangeGetStartColumn)
-      .def_prop_ro("end_line", mlirLocationFileLineColRangeGetEndLine)
-      .def_prop_ro("end_col", mlirLocationFileLineColRangeGetEndColumn)
+          nb::arg("context") = nb::none(),
+          "Gets a Location representing a file, line and column range.")
+      .def("is_a_file", mlirLocationIsAFileLineColRange,
+           "Returns True if this location is a FileLineColLoc.")
+      .def_prop_ro(
+          "filename",
+          [](MlirLocation loc) {
+            return mlirIdentifierStr(
+                mlirLocationFileLineColRangeGetFilename(loc));
+          },
+          "Gets the filename from a FileLineColLoc.")
+      .def_prop_ro("start_line", mlirLocationFileLineColRangeGetStartLine,
+                   "Gets the start line number from a `FileLineColLoc`.")
+      .def_prop_ro("start_col", mlirLocationFileLineColRangeGetStartColumn,
+                   "Gets the start column number from a `FileLineColLoc`.")
+      .def_prop_ro("end_line", mlirLocationFileLineColRangeGetEndLine,
+                   "Gets the end line number from a `FileLineColLoc`.")
+      .def_prop_ro("end_col", mlirLocationFileLineColRangeGetEndColumn,
+                   "Gets the end column number from a `FileLineColLoc`.")
       .def_static(
           "fused",
           [](const std::vector<PyLocation> &pyLocations,
@@ -3300,8 +3342,11 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return PyLocation(context->getRef(), location);
           },
           nb::arg("locations"), nb::arg("metadata") = nb::none(),
-          nb::arg("context") = nb::none(), kContextGetFusedLocationDocstring)
-      .def("is_a_fused", mlirLocationIsAFused)
+          nb::arg("context") = nb::none(),
+          "Gets a Location representing a fused location with optional "
+          "metadata.")
+      .def("is_a_fused", mlirLocationIsAFused,
+           "Returns True if this location is a `FusedLoc`.")
       .def_prop_ro(
           "locations",
           [](PyLocation &self) {
@@ -3314,7 +3359,8 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             for (unsigned i = 0; i < numLocations; ++i)
               pyLocations.emplace_back(self.getContext(), locations[i]);
             return pyLocations;
-          })
+          },
+          "Gets the list of locations from a `FusedLoc`.")
       .def_static(
           "name",
           [](std::string name, std::optional<PyLocation> childLoc,
@@ -3327,17 +3373,24 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                              : mlirLocationUnknownGet(context->get())));
           },
           nb::arg("name"), nb::arg("childLoc") = nb::none(),
-          nb::arg("context") = nb::none(), kContextGetNameLocationDocString)
-      .def("is_a_name", mlirLocationIsAName)
-      .def_prop_ro("name_str",
-                   [](MlirLocation loc) {
-                     return mlirIdentifierStr(mlirLocationNameGetName(loc));
-                   })
-      .def_prop_ro("child_loc",
-                   [](PyLocation &self) {
-                     return PyLocation(self.getContext(),
-                                       mlirLocationNameGetChildLoc(self));
-                   })
+          nb::arg("context") = nb::none(),
+          "Gets a Location representing a named location with optional child "
+          "location.")
+      .def("is_a_name", mlirLocationIsAName,
+           "Returns True if this location is a `NameLoc`.")
+      .def_prop_ro(
+          "name_str",
+          [](MlirLocation loc) {
+            return mlirIdentifierStr(mlirLocationNameGetName(loc));
+          },
+          "Gets the name string from a `NameLoc`.")
+      .def_prop_ro(
+          "child_loc",
+          [](PyLocation &self) {
+            return PyLocation(self.getContext(),
+                              mlirLocationNameGetChildLoc(self));
+          },
+          "Gets the child location from a `NameLoc`.")
       .def_static(
           "from_attr",
           [](PyAttribute &attribute, DefaultingPyMlirContext context) {
@@ -3345,41 +3398,59 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                               mlirLocationFromAttribute(attribute));
           },
           nb::arg("attribute"), nb::arg("context") = nb::none(),
-          "Gets a Location from a LocationAttr")
+          "Gets a Location from a `LocationAttr`.")
       .def_prop_ro(
           "context",
           [](PyLocation &self) -> nb::typed<nb::object, PyMlirContext> {
             return self.getContext().getObject();
           },
-          "Context that owns the Location")
+          "Context that owns the `Location`.")
       .def_prop_ro(
           "attr",
           [](PyLocation &self) {
             return PyAttribute(self.getContext(),
                                mlirLocationGetAttribute(self));
           },
-          "Get the underlying LocationAttr")
+          "Get the underlying `LocationAttr`.")
       .def(
           "emit_error",
           [](PyLocation &self, std::string message) {
             mlirEmitError(self, message.c_str());
           },
-          nb::arg("message"), "Emits an error at this location")
-      .def("__repr__", [](PyLocation &self) {
-        PyPrintAccumulator printAccum;
-        mlirLocationPrint(self, printAccum.getCallback(),
-                          printAccum.getUserData());
-        return printAccum.join();
-      });
+          nb::arg("message"),
+          R"(
+            Emits an error diagnostic at this location.
+
+            Args:
+              message: The error message to emit.)")
+      .def(
+          "__repr__",
+          [](PyLocation &self) {
+            PyPrintAccumulator printAccum;
+            mlirLocationPrint(self, printAccum.getCallback(),
+                              printAccum.getUserData());
+            return printAccum.join();
+          },
+          "Returns the assembly representation of the location.");
 
   //----------------------------------------------------------------------------
   // Mapping of Module
   //----------------------------------------------------------------------------
   nb::class_<PyModule>(m, "Module", nb::is_weak_referenceable())
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyModule::getCapsule)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyModule::getCapsule,
+                   "Gets a capsule wrapping the MlirModule.")
       .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyModule::createFromCapsule,
-                  kModuleCAPICreate)
-      .def("_clear_mlir_module", &PyModule::clearMlirModule)
+                  R"(
+                    Creates a Module from a `MlirModule` wrapped by a capsule (i.e. `module._CAPIPtr`).
+
+                    This returns a new object **BUT** `_clear_mlir_module(module)` must be called to
+                    prevent double-frees (of the underlying `mlir::Module`).)")
+      .def("_clear_mlir_module", &PyModule::clearMlirModule,
+           R"(
+             Clears the internal MLIR module reference.
+
+             This is used internally to prevent double-free when ownership is transferred
+             via the C API capsule mechanism. Not intended for normal use.)")
       .def_static(
           "parse",
           [](const std::string &moduleAsm, DefaultingPyMlirContext context)
@@ -3427,13 +3498,13 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             MlirModule module = mlirModuleCreateEmpty(pyLoc.get());
             return PyModule::forModule(module).releaseObject();
           },
-          nb::arg("loc") = nb::none(), "Creates an empty module")
+          nb::arg("loc") = nb::none(), "Creates an empty module.")
       .def_prop_ro(
           "context",
           [](PyModule &self) -> nb::typed<nb::object, PyMlirContext> {
             return self.getContext().getObject();
           },
-          "Context that created the Module")
+          "Context that created the `Module`.")
       .def_prop_ro(
           "operation",
           [](PyModule &self) -> nb::typed<nb::object, PyOperation> {
@@ -3442,7 +3513,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                                              self.getRef().releaseObject())
                 .releaseObject();
           },
-          "Accesses the module as an operation")
+          "Accesses the module as an operation.")
       .def_prop_ro(
           "body",
           [](PyModule &self) {
@@ -3452,7 +3523,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             PyBlock returnBlock(moduleOp, mlirModuleGetBody(self.get()));
             return returnBlock;
           },
-          "Return the block for this module")
+          "Return the block for this module.")
       .def(
           "dump",
           [](PyModule &self) {
@@ -3465,39 +3536,59 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             // Defer to the operation's __str__.
             return self.attr("operation").attr("__str__")();
           },
-          nb::sig("def __str__(self) -> str"), kOperationStrDunderDocstring)
+          nb::sig("def __str__(self) -> str"),
+          R"(
+            Gets the assembly form of the operation with default options.
+
+            If more advanced control over the assembly formatting or I/O options is needed,
+            use the dedicated print or get_asm method, which supports keyword arguments to
+            customize behavior.
+          )")
       .def(
           "__eq__",
           [](PyModule &self, PyModule &other) {
             return mlirModuleEqual(self.get(), other.get());
           },
-          "other"_a)
-      .def("__hash__",
-           [](PyModule &self) { return mlirModuleHashValue(self.get()); });
+          "other"_a, "Compares two modules for equality.")
+      .def(
+          "__hash__",
+          [](PyModule &self) { return mlirModuleHashValue(self.get()); },
+          "Returns the hash value of the module.");
 
   //----------------------------------------------------------------------------
   // Mapping of Operation.
   //----------------------------------------------------------------------------
   nb::class_<PyOperationBase>(m, "_OperationBase")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR,
-                   [](PyOperationBase &self) {
-                     return self.getOperation().getCapsule();
-                   })
-      .def("__eq__",
-           [](PyOperationBase &self, PyOperationBase &other) {
-             return mlirOperationEqual(self.getOperation().get(),
-                                       other.getOperation().get());
-           })
-      .def("__eq__",
-           [](PyOperationBase &self, nb::object other) { return false; })
-      .def("__hash__",
-           [](PyOperationBase &self) {
-             return mlirOperationHashValue(self.getOperation().get());
-           })
-      .def_prop_ro("attributes",
-                   [](PyOperationBase &self) {
-                     return PyOpAttributeMap(self.getOperation().getRef());
-                   })
+      .def_prop_ro(
+          MLIR_PYTHON_CAPI_PTR_ATTR,
+          [](PyOperationBase &self) {
+            return self.getOperation().getCapsule();
+          },
+          "Gets a capsule wrapping the `MlirOperation`.")
+      .def(
+          "__eq__",
+          [](PyOperationBase &self, PyOperationBase &other) {
+            return mlirOperationEqual(self.getOperation().get(),
+                                      other.getOperation().get());
+          },
+          "Compares two operations for equality.")
+      .def(
+          "__eq__",
+          [](PyOperationBase &self, nb::object other) { return false; },
+          "Compares operation with non-operation object (always returns "
+          "False).")
+      .def(
+          "__hash__",
+          [](PyOperationBase &self) {
+            return mlirOperationHashValue(self.getOperation().get());
+          },
+          "Returns the hash value of the operation.")
+      .def_prop_ro(
+          "attributes",
+          [](PyOperationBase &self) {
+            return PyOpAttributeMap(self.getOperation().getRef());
+          },
+          "Returns a dictionary-like map of operation attributes.")
       .def_prop_ro(
           "context",
           [](PyOperationBase &self) -> nb::typed<nb::object, PyMlirContext> {
@@ -3505,22 +3596,28 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             concreteOperation.checkValid();
             return concreteOperation.getContext().getObject();
           },
-          "Context that owns the Operation")
-      .def_prop_ro("name",
-                   [](PyOperationBase &self) {
-                     auto &concreteOperation = self.getOperation();
-                     concreteOperation.checkValid();
-                     MlirOperation operation = concreteOperation.get();
-                     return mlirIdentifierStr(mlirOperationGetName(operation));
-                   })
-      .def_prop_ro("operands",
-                   [](PyOperationBase &self) {
-                     return PyOpOperandList(self.getOperation().getRef());
-                   })
-      .def_prop_ro("regions",
-                   [](PyOperationBase &self) {
-                     return PyRegionList(self.getOperation().getRef());
-                   })
+          "Context that owns the operation.")
+      .def_prop_ro(
+          "name",
+          [](PyOperationBase &self) {
+            auto &concreteOperation = self.getOperation();
+            concreteOperation.checkValid();
+            MlirOperation operation = concreteOperation.get();
+            return mlirIdentifierStr(mlirOperationGetName(operation));
+          },
+          "Returns the fully qualified name of the operation.")
+      .def_prop_ro(
+          "operands",
+          [](PyOperationBase &self) {
+            return PyOpOperandList(self.getOperation().getRef());
+          },
+          "Returns the list of operation operands.")
+      .def_prop_ro(
+          "regions",
+          [](PyOperationBase &self) {
+            return PyRegionList(self.getOperation().getRef());
+          },
+          "Returns the list of operation regions.")
       .def_prop_ro(
           "results",
           [](PyOperationBase &self) {
@@ -3551,14 +3648,16 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                          "defined or derived from."),
           nb::for_setter("Sets the source location the operation was defined "
                          "or derived from."))
-      .def_prop_ro("parent",
-                   [](PyOperationBase &self)
-                       -> std::optional<nb::typed<nb::object, PyOperation>> {
-                     auto parent = self.getOperation().getParentOperation();
-                     if (parent)
-                       return parent->getObject();
-                     return {};
-                   })
+      .def_prop_ro(
+          "parent",
+          [](PyOperationBase &self)
+              -> std::optional<nb::typed<nb::object, PyOperation>> {
+            auto parent = self.getOperation().getParentOperation();
+            if (parent)
+              return parent->getObject();
+            return {};
+          },
+          "Returns the parent operation, or `None` if at top level.")
       .def(
           "__str__",
           [](PyOperationBase &self) {
@@ -3579,7 +3678,14 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            nb::overload_cast<PyAsmState &, nb::object, bool>(
                &PyOperationBase::print),
            nb::arg("state"), nb::arg("file") = nb::none(),
-           nb::arg("binary") = false, kOperationPrintStateDocstring)
+           nb::arg("binary") = false,
+           R"(
+             Prints the assembly form of the operation to a file like object.
+
+             Args:
+               state: `AsmState` capturing the operation numbering and flags.
+               file: Optional file like object to write to. Defaults to sys.stdout.
+               binary: Whether to write `bytes` (True) or `str` (False). Defaults to False.)")
       .def("print",
            nb::overload_cast<std::optional<int64_t>, std::optional<int64_t>,
                              bool, bool, bool, bool, bool, bool, nb::object,
@@ -3594,10 +3700,47 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            nb::arg("use_name_loc_as_prefix") = false,
            nb::arg("assume_verified") = false, nb::arg("file") = nb::none(),
            nb::arg("binary") = false, nb::arg("skip_regions") = false,
-           kOperationPrintDocstring)
+           R"(
+             Prints the assembly form of the operation to a file like object.
+
+             Args:
+               large_elements_limit: Whether to elide elements attributes above this
+                 number of elements. Defaults to None (no limit).
+               large_resource_limit: Whether to elide resource attributes above this
+                 number of characters. Defaults to None (no limit). If large_elements_limit
+                 is set and this is None, the behavior will be to use large_elements_limit
+                 as large_resource_limit.
+               enable_debug_info: Whether to print debug/location information. Defaults
+                 to False.
+               pretty_debug_info: Whether to format debug information for easier reading
+                 by a human (warning: the result is unparseable). Defaults to False.
+               print_generic_op_form: Whether to print the generic assembly forms of all
+                 ops. Defaults to False.
+               use_local_scope: Whether to print in a way that is more optimized for
+                 multi-threaded access but may not be consistent with how the overall
+                 module prints.
+               use_name_loc_as_prefix: Whether to use location attributes (NameLoc) as
+                 prefixes for the SSA identifiers. Defaults to False.
+               assume_verified: By default, if not printing generic form, the verifier
+                 will be run and if it fails, generic form will be printed with a comment
+                 about failed verification. While a reasonable default for interactive use,
+                 for systematic use, it is often better for the caller to verify explicitly
+                 and report failures in a more robust fashion. Set this to True if doing this
+                 in order to avoid running a redundant verification. If the IR is actually
+                 invalid, behavior is undefined.
+               file: The file like object to write to. Defaults to sys.stdout.
+               binary: Whether to write bytes (True) or str (False). Defaults to False.
+               skip_regions: Whether to skip printing regions. Defaults to False.)")
       .def("write_bytecode", &PyOperationBase::writeBytecode, nb::arg("file"),
            nb::arg("desired_version") = nb::none(),
-           kOperationPrintBytecodeDocstring)
+           R"(
+             Write the bytecode form of the operation to a file like object.
+
+             Args:
+               file: The file like object to write to.
+               desired_version: Optional version of bytecode to emit.
+             Returns:
+               The bytecode writer status.)")
       .def("get_asm", &PyOperationBase::getAsm,
            // Careful: Lots of arguments must match up with get_asm method.
            nb::arg("binary") = false,
@@ -3609,7 +3752,17 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            nb::arg("use_local_scope") = false,
            nb::arg("use_name_loc_as_prefix") = false,
            nb::arg("assume_verified") = false, nb::arg("skip_regions") = false,
-           kOperationGetAsmDocstring)
+           R"(
+            Gets the assembly form of the operation with all options available.
+
+            Args:
+              binary: Whether to return a bytes (True) or str (False) object. Defaults to
+                False.
+              ... others ...: See the print() method for common keyword arguments for
+                configuring the printout.
+            Returns:
+              Either a bytes or str object, depending on the setting of the `binary`
+              argument.)")
       .def("verify", &PyOperationBase::verify,
            "Verify the operation. Raises MLIRError if verification fails, and "
            "returns true otherwise.")
@@ -3621,18 +3774,31 @@ void mlir::python::populateIRCore(nb::module_ &m) {
            "block.")
       .def("is_before_in_block", &PyOperationBase::isBeforeInBlock,
            nb::arg("other"),
-           "Given an operation 'other' that is within the same parent block, "
-           "return"
-           "whether the current operation is before 'other' in the operation "
-           "list"
-           "of the parent block.")
+           R"(
+             Checks if this operation is before another in the same block.
+
+             Args:
+               other: Another operation in the same parent block.
+
+             Returns:
+               True if this operation is before `other` in the operation list of the parent block.)")
       .def(
           "clone",
           [](PyOperationBase &self,
              const nb::object &ip) -> nb::typed<nb::object, PyOperation> {
             return self.getOperation().clone(ip);
           },
-          nb::arg("ip") = nb::none())
+          nb::arg("ip") = nb::none(),
+          R"(
+            Creates a deep copy of the operation.
+
+            Args:
+              ip: Optional insertion point where the cloned operation should be inserted.
+                If None, the current insertion point is used. If False, the operation
+                remains detached.
+
+            Returns:
+              A new Operation that is a clone of this operation.)")
       .def(
           "detach_from_parent",
           [](PyOperationBase &self) -> nb::typed<nb::object, PyOpView> {
@@ -3653,13 +3819,24 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return operation.isAttached();
           },
           "Reports if the operation is attached to its parent block.")
-      .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); })
+      .def(
+          "erase", [](PyOperationBase &self) { self.getOperation().erase(); },
+          R"(
+            Erases the operation and frees its memory.
+
+            Note:
+              After erasing, any Python references to the operation become invalid.)")
       .def("walk", &PyOperationBase::walk, nb::arg("callback"),
            nb::arg("walk_order") = MlirWalkPostOrder,
            // clang-format off
-          nb::sig("def walk(self, callback: Callable[[Operation], WalkResult], walk_order: WalkOrder) -> None")
+          nb::sig("def walk(self, callback: Callable[[Operation], WalkResult], walk_order: WalkOrder) -> None"),
            // clang-format on
-      );
+           R"(
+             Walks the operation tree with a callback function.
+
+             Args:
+               callback: A callable that takes an Operation and returns a WalkResult.
+               walk_order: The order of traversal (PRE_ORDER or POST_ORDER).)");
 
   nb::class_<PyOperation, PyOperationBase>(m, "Operation")
       .def_static(
@@ -3692,7 +3869,22 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           nb::arg("operands") = nb::none(), nb::arg("attributes") = nb::none(),
           nb::arg("successors") = nb::none(), nb::arg("regions") = 0,
           nb::arg("loc") = nb::none(), nb::arg("ip") = nb::none(),
-          nb::arg("infer_type") = false, kOperationCreateDocstring)
+          nb::arg("infer_type") = false,
+          R"(
+            Creates a new operation.
+
+            Args:
+              name: Operation name (e.g. `dialect.operation`).
+              results: Optional sequence of Type representing op result types.
+              operands: Optional operands of the operation.
+              attributes: Optional Dict of {str: Attribute}.
+              successors: Optional List of Block for the operation's successors.
+              regions: Number of regions to create (default = 0).
+              location: Optional Location object (defaults to resolve from context manager).
+              ip: Optional InsertionPoint (defaults to resolve from context manager or set to False to disable insertion, even with an insertion point set in the context manager).
+              infer_type: Whether to infer result types (default = False).
+            Returns:
+              A new detached Operation object. Detached operations can be added to blocks, which causes them to become attached.)")
       .def_static(
           "parse",
           [](const std::string &sourceStr, const std::string &sourceName,
@@ -3705,18 +3897,30 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           nb::arg("context") = nb::none(),
           "Parses an operation. Supports both text assembly format and binary "
           "bytecode format.")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule,
+                   "Gets a capsule wrapping the MlirOperation.")
       .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR,
-                  &PyOperation::createFromCapsule)
-      .def_prop_ro("operation",
-                   [](nb::object self) -> nb::typed<nb::object, PyOperation> {
-                     return self;
-                   })
-      .def_prop_ro("opview",
-                   [](PyOperation &self) -> nb::typed<nb::object, PyOpView> {
-                     return self.createOpView();
-                   })
-      .def_prop_ro("block", &PyOperation::getBlock)
+                  &PyOperation::createFromCapsule,
+                  "Creates an Operation from a capsule wrapping MlirOperation.")
+      .def_prop_ro(
+          "operation",
+          [](nb::object self) -> nb::typed<nb::object, PyOperation> {
+            return self;
+          },
+          "Returns self (the operation).")
+      .def_prop_ro(
+          "opview",
+          [](PyOperation &self) -> nb::typed<nb::object, PyOpView> {
+            return self.createOpView();
+          },
+          R"(
+            Returns an OpView of this operation.
+
+            Note:
+              If the operation has a registered and loaded dialect then this OpView will
+              be concrete wrapper class.)")
+      .def_prop_ro("block", &PyOperation::getBlock,
+                   "Returns the block containing this operation.")
       .def_prop_ro(
           "successors",
           [](PyOperationBase &self) {
@@ -3830,7 +4034,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
       },
       nb::arg("cls"), nb::arg("source"), nb::kw_only(),
       nb::arg("source_name") = "", nb::arg("context") = nb::none(),
-      "Parses a specific, generated OpView based on class level attributes");
+      "Parses a specific, generated OpView based on class level attributes.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyRegion.
@@ -3856,17 +4060,22 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return PyBlockIterator(self.getParentOperation(), firstBlock);
           },
           "Iterates over blocks in the region.")
-      .def("__eq__",
-           [](PyRegion &self, PyRegion &other) {
-             return self.get().ptr == other.get().ptr;
-           })
-      .def("__eq__", [](PyRegion &self, nb::object &other) { return false; });
+      .def(
+          "__eq__",
+          [](PyRegion &self, PyRegion &other) {
+            return self.get().ptr == other.get().ptr;
+          },
+          "Compares two regions for pointer equality.")
+      .def(
+          "__eq__", [](PyRegion &self, nb::object &other) { return false; },
+          "Compares region with non-region object (always returns False).");
 
   //----------------------------------------------------------------------------
   // Mapping of PyBlock.
   //----------------------------------------------------------------------------
   nb::class_<PyBlock>(m, "Block")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyBlock::getCapsule)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyBlock::getCapsule,
+                   "Gets a capsule wrapping the MlirBlock.")
       .def_prop_ro(
           "owner",
           [](PyBlock &self) -> nb::typed<nb::object, PyOpView> {
@@ -3893,14 +4102,26 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                                    mlirBlockAddArgument(self.get(), type, loc));
           },
           "type"_a, "loc"_a,
-          "Append an argument of the specified type to the block and returns "
-          "the newly added argument.")
+          R"(
+            Appends an argument of the specified type to the block.
+
+            Args:
+              type: The type of the argument to add.
+              loc: The source location for the argument.
+
+            Returns:
+              The newly added block argument.)")
       .def(
           "erase_argument",
           [](PyBlock &self, unsigned index) {
             return mlirBlockEraseArgument(self.get(), index);
           },
-          "Erase the argument at 'index' and remove it from the argument list.")
+          nb::arg("index"),
+          R"(
+            Erases the argument at the specified index.
+
+            Args:
+              index: The index of the argument to erase.)")
       .def_prop_ro(
           "operations",
           [](PyBlock &self) {
@@ -3928,7 +4149,14 @@ void mlir::python::populateIRCore(nb::module_ &m) {
               mlirBlockDetach(b);
             mlirRegionAppendOwnedBlock(region.get(), b);
           },
-          "Append this block to a region, transferring ownership if necessary")
+          nb::arg("region"),
+          R"(
+            Appends this block to a region.
+
+            Transfers ownership if the block is currently owned by another region.
+
+            Args:
+              region: The region to append the block to.)")
       .def(
           "create_before",
           [](PyBlock &self, const nb::args &pyArgTypes,
@@ -3969,15 +4197,21 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                                        firstOperation);
           },
           "Iterates over operations in the block.")
-      .def("__eq__",
-           [](PyBlock &self, PyBlock &other) {
-             return self.get().ptr == other.get().ptr;
-           })
-      .def("__eq__", [](PyBlock &self, nb::object &other) { return false; })
-      .def("__hash__",
-           [](PyBlock &self) {
-             return static_cast<size_t>(llvm::hash_value(self.get().ptr));
-           })
+      .def(
+          "__eq__",
+          [](PyBlock &self, PyBlock &other) {
+            return self.get().ptr == other.get().ptr;
+          },
+          "Compares two blocks for pointer equality.")
+      .def(
+          "__eq__", [](PyBlock &self, nb::object &other) { return false; },
+          "Compares block with non-block object (always returns False).")
+      .def(
+          "__hash__",
+          [](PyBlock &self) {
+            return static_cast<size_t>(llvm::hash_value(self.get().ptr));
+          },
+          "Returns the hash value of the block.")
       .def(
           "__str__",
           [](PyBlock &self) {
@@ -4000,8 +4234,13 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                 self.getParentOperation().getObject());
           },
           nb::arg("operation"),
-          "Appends an operation to this block. If the operation is currently "
-          "in another block, it will be moved.")
+          R"(
+            Appends an operation to this block.
+
+            If the operation is currently in another block, it will be moved.
+
+            Args:
+              operation: The operation to append to the block.)")
       .def_prop_ro(
           "successors",
           [](PyBlock &self) {
@@ -4022,10 +4261,12 @@ void mlir::python::populateIRCore(nb::module_ &m) {
   nb::class_<PyInsertionPoint>(m, "InsertionPoint")
       .def(nb::init<PyBlock &>(), nb::arg("block"),
            "Inserts after the last operation but still inside the block.")
-      .def("__enter__", &PyInsertionPoint::contextEnter)
+      .def("__enter__", &PyInsertionPoint::contextEnter,
+           "Enters the insertion point as a context manager.")
       .def("__exit__", &PyInsertionPoint::contextExit,
            nb::arg("exc_type").none(), nb::arg("exc_value").none(),
-           nb::arg("traceback").none())
+           nb::arg("traceback").none(),
+           "Exits the insertion point context manager.")
       .def_prop_ro_static(
           "current",
           [](nb::object & /*class*/) {
@@ -4036,20 +4277,50 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           },
           nb::sig("def current(/) -> InsertionPoint"),
           "Gets the InsertionPoint bound to the current thread or raises "
-          "ValueError if none has been set")
+          "ValueError if none has been set.")
       .def(nb::init<PyOperationBase &>(), nb::arg("beforeOperation"),
            "Inserts before a referenced operation.")
       .def_static("at_block_begin", &PyInsertionPoint::atBlockBegin,
-                  nb::arg("block"), "Inserts at the beginning of the block.")
+                  nb::arg("block"),
+                  R"(
+                    Creates an insertion point at the beginning of a block.
+
+                    Args:
+                      block: The block at whose beginning operations should be inserted.
+
+                    Returns:
+                      An InsertionPoint at the block's beginning.)")
       .def_static("at_block_terminator", &PyInsertionPoint::atBlockTerminator,
-                  nb::arg("block"), "Inserts before the block terminator.")
+                  nb::arg("block"),
+                  R"(
+                    Creates an insertion point before a block's terminator.
+
+                    Args:
+                      block: The block whose terminator to insert before.
+
+                    Returns:
+                      An InsertionPoint before the terminator.
+
+                    Raises:
+                      ValueError: If the block has no terminator.)")
       .def_static("after", &PyInsertionPoint::after, nb::arg("operation"),
-                  "Inserts after the operation.")
+                  R"(
+                    Creates an insertion point immediately after an operation.
+
+                    Args:
+                      operation: The operation after which to insert.
+
+                    Returns:
+                      An InsertionPoint after the operation.)")
       .def("insert", &PyInsertionPoint::insert, nb::arg("operation"),
-           "Inserts an operation.")
+           R"(
+             Inserts an operation at this insertion point.
+
+             Args:
+               operation: The operation to insert.)")
       .def_prop_ro(
           "block", [](PyInsertionPoint &self) { return self.getBlock(); },
-          "Returns the block that this InsertionPoint points to.")
+          "Returns the block that this `InsertionPoint` points to.")
       .def_prop_ro(
           "ref_operation",
           [](PyInsertionPoint &self)
@@ -4061,7 +4332,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           },
           "The reference operation before which new operations are "
           "inserted, or None if the insertion point is at the end of "
-          "the block");
+          "the block.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyAttribute.
@@ -4070,10 +4341,12 @@ void mlir::python::populateIRCore(nb::module_ &m) {
       // Delegate to the PyAttribute copy constructor, which will also lifetime
       // extend the backing context which owns the MlirAttribute.
       .def(nb::init<PyAttribute &>(), nb::arg("cast_from_type"),
-           "Casts the passed attribute to the generic Attribute")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyAttribute::getCapsule)
-      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR,
-                  &PyAttribute::createFromCapsule)
+           "Casts the passed attribute to the generic `Attribute`.")
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyAttribute::getCapsule,
+                   "Gets a capsule wrapping the MlirAttribute.")
+      .def_static(
+          MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAttribute::createFromCapsule,
+          "Creates an Attribute from a capsule wrapping `MlirAttribute`.")
       .def_static(
           "parse",
           [](const std::string &attrSpec, DefaultingPyMlirContext context)
@@ -4086,33 +4359,49 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return PyAttribute(context.get()->getRef(), attr).maybeDownCast();
           },
           nb::arg("asm"), nb::arg("context") = nb::none(),
-          "Parses an attribute from an assembly form. Raises an MLIRError on "
+          "Parses an attribute from an assembly form. Raises an `MLIRError` on "
           "failure.")
       .def_prop_ro(
           "context",
           [](PyAttribute &self) -> nb::typed<nb::object, PyMlirContext> {
             return self.getContext().getObject();
           },
-          "Context that owns the Attribute")
-      .def_prop_ro("type",
-                   [](PyAttribute &self) -> nb::typed<nb::object, PyType> {
-                     return PyType(self.getContext(),
-                                   mlirAttributeGetType(self))
-                         .maybeDownCast();
-                   })
+          "Context that owns the `Attribute`.")
+      .def_prop_ro(
+          "type",
+          [](PyAttribute &self) -> nb::typed<nb::object, PyType> {
+            return PyType(self.getContext(), mlirAttributeGetType(self))
+                .maybeDownCast();
+          },
+          "Returns the type of the `Attribute`.")
       .def(
           "get_named",
           [](PyAttribute &self, std::string name) {
             return PyNamedAttribute(self, std::move(name));
           },
-          nb::keep_alive<0, 1>(), "Binds a name to the attribute")
-      .def("__eq__",
-           [](PyAttribute &self, PyAttribute &other) { return self == other; })
-      .def("__eq__", [](PyAttribute &self, nb::object &other) { return false; })
-      .def("__hash__",
-           [](PyAttribute &self) {
-             return static_cast<size_t>(llvm::hash_value(self.get().ptr));
-           })
+          nb::keep_alive<0, 1>(),
+          R"(
+            Binds a name to the attribute, creating a `NamedAttribute`.
+
+            Args:
+              name: The name to bind to the `Attribute`.
+
+            Returns:
+              A `NamedAttribute` with the given name and this attribute.)")
+      .def(
+          "__eq__",
+          [](PyAttribute &self, PyAttribute &other) { return self == other; },
+          "Compares two attributes for equality.")
+      .def(
+          "__eq__", [](PyAttribute &self, nb::object &other) { return false; },
+          "Compares attribute with non-attribute object (always returns "
+          "False).")
+      .def(
+          "__hash__",
+          [](PyAttribute &self) {
+            return static_cast<size_t>(llvm::hash_value(self.get().ptr));
+          },
+          "Returns the hash value of the attribute.")
       .def(
           "dump", [](PyAttribute &self) { mlirAttributeDump(self); },
           kDumpDocstring)
@@ -4125,61 +4414,69 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return printAccum.join();
           },
           "Returns the assembly form of the Attribute.")
-      .def("__repr__",
-           [](PyAttribute &self) {
-             // Generally, assembly formats are not printed for __repr__ because
-             // this can cause exceptionally long debug output and exceptions.
-             // However, attribute values are generally considered useful and
-             // are printed. This may need to be re-evaluated if debug dumps end
-             // up being excessive.
-             PyPrintAccumulator printAccum;
-             printAccum.parts.append("Attribute(");
-             mlirAttributePrint(self, printAccum.getCallback(),
-                                printAccum.getUserData());
-             printAccum.parts.append(")");
-             return printAccum.join();
-           })
-      .def_prop_ro("typeid",
-                   [](PyAttribute &self) {
-                     MlirTypeID mlirTypeID = mlirAttributeGetTypeID(self);
-                     assert(!mlirTypeIDIsNull(mlirTypeID) &&
-                            "mlirTypeID was expected to be non-null.");
-                     return PyTypeID(mlirTypeID);
-                   })
-      .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
-           [](PyAttribute &self) -> nb::typed<nb::object, PyAttribute> {
-             return self.maybeDownCast();
-           });
+      .def(
+          "__repr__",
+          [](PyAttribute &self) {
+            // Generally, assembly formats are not printed for __repr__ because
+            // this can cause exceptionally long debug output and exceptions.
+            // However, attribute values are generally considered useful and
+            // are printed. This may need to be re-evaluated if debug dumps end
+            // up being excessive.
+            PyPrintAccumulator printAccum;
+            printAccum.parts.append("Attribute(");
+            mlirAttributePrint(self, printAccum.getCallback(),
+                               printAccum.getUserData());
+            printAccum.parts.append(")");
+            return printAccum.join();
+          },
+          "Returns a string representation of the attribute.")
+      .def_prop_ro(
+          "typeid",
+          [](PyAttribute &self) {
+            MlirTypeID mlirTypeID = mlirAttributeGetTypeID(self);
+            assert(!mlirTypeIDIsNull(mlirTypeID) &&
+                   "mlirTypeID was expected to be non-null.");
+            return PyTypeID(mlirTypeID);
+          },
+          "Returns the `TypeID` of the attribute.")
+      .def(
+          MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
+          [](PyAttribute &self) -> nb::typed<nb::object, PyAttribute> {
+            return self.maybeDownCast();
+          },
+          "Downcasts the attribute to a more specific attribute if possible.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyNamedAttribute
   //----------------------------------------------------------------------------
   nb::class_<PyNamedAttribute>(m, "NamedAttribute")
-      .def("__repr__",
-           [](PyNamedAttribute &self) {
-             PyPrintAccumulator printAccum;
-             printAccum.parts.append("NamedAttribute(");
-             printAccum.parts.append(
-                 nb::str(mlirIdentifierStr(self.namedAttr.name).data,
-                         mlirIdentifierStr(self.namedAttr.name).length));
-             printAccum.parts.append("=");
-             mlirAttributePrint(self.namedAttr.attribute,
-                                printAccum.getCallback(),
-                                printAccum.getUserData());
-             printAccum.parts.append(")");
-             return printAccum.join();
-           })
+      .def(
+          "__repr__",
+          [](PyNamedAttribute &self) {
+            PyPrintAccumulator printAccum;
+            printAccum.parts.append("NamedAttribute(");
+            printAccum.parts.append(
+                nb::str(mlirIdentifierStr(self.namedAttr.name).data,
+                        mlirIdentifierStr(self.namedAttr.name).length));
+            printAccum.parts.append("=");
+            mlirAttributePrint(self.namedAttr.attribute,
+                               printAccum.getCallback(),
+                               printAccum.getUserData());
+            printAccum.parts.append(")");
+            return printAccum.join();
+          },
+          "Returns a string representation of the named attribute.")
       .def_prop_ro(
           "name",
           [](PyNamedAttribute &self) {
             return mlirIdentifierStr(self.namedAttr.name);
           },
-          "The name of the NamedAttribute binding")
+          "The name of the `NamedAttribute` binding.")
       .def_prop_ro(
           "attr",
           [](PyNamedAttribute &self) { return self.namedAttr.attribute; },
           nb::keep_alive<0, 1>(), nb::sig("def attr(self) -> Attribute"),
-          "The underlying generic attribute of the NamedAttribute binding");
+          "The underlying generic attribute of the `NamedAttribute` binding.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyType.
@@ -4188,9 +4485,11 @@ void mlir::python::populateIRCore(nb::module_ &m) {
       // Delegate to the PyType copy constructor, which will also lifetime
       // extend the backing context which owns the MlirType.
       .def(nb::init<PyType &>(), nb::arg("cast_from_type"),
-           "Casts the passed type to the generic Type")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyType::getCapsule)
-      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule)
+           "Casts the passed type to the generic `Type`.")
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyType::getCapsule,
+                   "Gets a capsule wrapping the `MlirType`.")
+      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule,
+                  "Creates a Type from a capsule wrapping `MlirType`.")
       .def_static(
           "parse",
           [](std::string typeSpec,
@@ -4203,21 +4502,31 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return PyType(context.get()->getRef(), type).maybeDownCast();
           },
           nb::arg("asm"), nb::arg("context") = nb::none(),
-          kContextParseTypeDocstring)
+          R"(
+            Parses the assembly form of a type.
+
+            Returns a Type object or raises an `MLIRError` if the type cannot be parsed.
+
+            See also: https://mlir.llvm.org/docs/LangRef/#type-system)")
       .def_prop_ro(
           "context",
           [](PyType &self) -> nb::typed<nb::object, PyMlirContext> {
             return self.getContext().getObject();
           },
-          "Context that owns the Type")
-      .def("__eq__", [](PyType &self, PyType &other) { return self == other; })
+          "Context that owns the `Type`.")
+      .def(
+          "__eq__", [](PyType &self, PyType &other) { return self == other; },
+          "Compares two types for equality.")
       .def(
           "__eq__", [](PyType &self, nb::object &other) { return false; },
-          nb::arg("other").none())
-      .def("__hash__",
-           [](PyType &self) {
-             return static_cast<size_t>(llvm::hash_value(self.get().ptr));
-           })
+          nb::arg("other").none(),
+          "Compares type with non-type object (always returns False).")
+      .def(
+          "__hash__",
+          [](PyType &self) {
+            return static_cast<size_t>(llvm::hash_value(self.get().ptr));
+          },
+          "Returns the hash value of the `Type`.")
       .def(
           "dump", [](PyType &self) { mlirTypeDump(self); }, kDumpDocstring)
       .def(
@@ -4228,60 +4537,81 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                           printAccum.getUserData());
             return printAccum.join();
           },
-          "Returns the assembly form of the type.")
-      .def("__repr__",
-           [](PyType &self) {
-             // Generally, assembly formats are not printed for __repr__ because
-             // this can cause exceptionally long debug output and exceptions.
-             // However, types are an exception as they typically have compact
-             // assembly forms and printing them is useful.
-             PyPrintAccumulator printAccum;
-             printAccum.parts.append("Type(");
-             mlirTypePrint(self, printAccum.getCallback(),
-                           printAccum.getUserData());
-             printAccum.parts.append(")");
-             return printAccum.join();
-           })
-      .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
-           [](PyType &self) -> nb::typed<nb::object, PyType> {
-             return self.maybeDownCast();
-           })
-      .def_prop_ro("typeid", [](PyType &self) {
-        MlirTypeID mlirTypeID = mlirTypeGetTypeID(self);
-        if (!mlirTypeIDIsNull(mlirTypeID))
-          return PyTypeID(mlirTypeID);
-        auto origRepr = nb::cast<std::string>(nb::repr(nb::cast(self)));
-        throw nb::value_error(
-            (origRepr + llvm::Twine(" has no typeid.")).str().c_str());
-      });
+          "Returns the assembly form of the `Type`.")
+      .def(
+          "__repr__",
+          [](PyType &self) {
+            // Generally, assembly formats are not printed for __repr__ because
+            // this can cause exceptionally long debug output and exceptions.
+            // However, types are an exception as they typically have compact
+            // assembly forms and printing them is useful.
+            PyPrintAccumulator printAccum;
+            printAccum.parts.append("Type(");
+            mlirTypePrint(self, printAccum.getCallback(),
+                          printAccum.getUserData());
+            printAccum.parts.append(")");
+            return printAccum.join();
+          },
+          "Returns a string representation of the `Type`.")
+      .def(
+          MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
+          [](PyType &self) -> nb::typed<nb::object, PyType> {
+            return self.maybeDownCast();
+          },
+          "Downcasts the Type to a more specific `Type` if possible.")
+      .def_prop_ro(
+          "typeid",
+          [](PyType &self) {
+            MlirTypeID mlirTypeID = mlirTypeGetTypeID(self);
+            if (!mlirTypeIDIsNull(mlirTypeID))
+              return PyTypeID(mlirTypeID);
+            auto origRepr = nb::cast<std::string>(nb::repr(nb::cast(self)));
+            throw nb::value_error(
+                (origRepr + llvm::Twine(" has no typeid.")).str().c_str());
+          },
+          "Returns the `TypeID` of the `Type`, or raises `ValueError` if "
+          "`Type` has no "
+          "`TypeID`.");
 
   //----------------------------------------------------------------------------
   // Mapping of PyTypeID.
   //----------------------------------------------------------------------------
   nb::class_<PyTypeID>(m, "TypeID")
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyTypeID::getCapsule)
-      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule)
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyTypeID::getCapsule,
+                   "Gets a capsule wrapping the `MlirTypeID`.")
+      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule,
+                  "Creates a `TypeID` from a capsule wrapping `MlirTypeID`.")
       // Note, this tests whether the underlying TypeIDs are the same,
       // not whether the wrapper MlirTypeIDs are the same, nor whether
       // the Python objects are the same (i.e., PyTypeID is a value type).
-      .def("__eq__",
-           [](PyTypeID &self, PyTypeID &other) { return self == other; })
-      .def("__eq__",
-           [](PyTypeID &self, const nb::object &other) { return false; })
+      .def(
+          "__eq__",
+          [](PyTypeID &self, PyTypeID &other) { return self == other; },
+          "Compares two `TypeID`s for equality.")
+      .def(
+          "__eq__",
+          [](PyTypeID &self, const nb::object &other) { return false; },
+          "Compares TypeID with non-TypeID object (always returns False).")
       // Note, this gives the hash value of the underlying TypeID, not the
       // hash value of the Python object, nor the hash value of the
       // MlirTypeID wrapper.
-      .def("__hash__", [](PyTypeID &self) {
-        return static_cast<size_t>(mlirTypeIDHashValue(self));
-      });
+      .def(
+          "__hash__",
+          [](PyTypeID &self) {
+            return static_cast<size_t>(mlirTypeIDHashValue(self));
+          },
+          "Returns the hash value of the `TypeID`.");
 
   //----------------------------------------------------------------------------
   // Mapping of Value.
   //----------------------------------------------------------------------------
   nb::class_<PyValue>(m, "Value")
-      .def(nb::init<PyValue &>(), nb::keep_alive<0, 1>(), nb::arg("value"))
-      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyValue::getCapsule)
-      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule)
+      .def(nb::init<PyValue &>(), nb::keep_alive<0, 1>(), nb::arg("value"),
+           "Creates a Value reference from another `Value`.")
+      .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyValue::getCapsule,
+                   "Gets a capsule wrapping the `MlirValue`.")
+      .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule,
+                  "Creates a `Value` from a capsule wrapping `MlirValue`.")
       .def_prop_ro(
           "context",
           [](PyValue &self) -> nb::typed<nb::object, PyMlirContext> {
@@ -4312,23 +4642,30 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             assert(false && "Value must be a block argument or an op result");
             return nb::none();
           },
-          // clang-format off
-          nb::sig("def owner(self) -> Operation | Block | None"))
-      // clang-format on
-      .def_prop_ro("uses",
-                   [](PyValue &self) {
-                     return PyOpOperandIterator(
-                         mlirValueGetFirstUse(self.get()));
-                   })
-      .def("__eq__",
-           [](PyValue &self, PyValue &other) {
-             return self.get().ptr == other.get().ptr;
-           })
-      .def("__eq__", [](PyValue &self, nb::object other) { return false; })
-      .def("__hash__",
-           [](PyValue &self) {
-             return static_cast<size_t>(llvm::hash_value(self.get().ptr));
-           })
+          "Returns the owner of the value (`Operation` for results, `Block` "
+          "for "
+          "arguments).")
+      .def_prop_ro(
+          "uses",
+          [](PyValue &self) {
+            return PyOpOperandIterator(mlirValueGetFirstUse(self.get()));
+          },
+          "Returns an iterator over uses of this value.")
+      .def(
+          "__eq__",
+          [](PyValue &self, PyValue &other) {
+            return self.get().ptr == other.get().ptr;
+          },
+          "Compares two values for pointer equality.")
+      .def(
+          "__eq__", [](PyValue &self, nb::object other) { return false; },
+          "Compares value with non-value object (always returns False).")
+      .def(
+          "__hash__",
+          [](PyValue &self) {
+            return static_cast<size_t>(llvm::hash_value(self.get().ptr));
+          },
+          "Returns the hash value of the value.")
       .def(
           "__str__",
           [](PyValue &self) {
@@ -4339,7 +4676,13 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             printAccum.parts.append(")");
             return printAccum.join();
           },
-          kValueDunderStrDocstring)
+          R"(
+            Returns the string form of the value.
+
+            If the value is a block argument, this is the assembly form of its type and the
+            position in the argument list. If the value is an operation result, this is
+            equivalent to printing the operation that produced it.
+          )")
       .def(
           "get_name",
           [](PyValue &self, bool useLocalScope, bool useNameLocAsPrefix) {
@@ -4359,7 +4702,16 @@ void mlir::python::populateIRCore(nb::module_ &m) {
             return printAccum.join();
           },
           nb::arg("use_local_scope") = false,
-          nb::arg("use_name_loc_as_prefix") = false)
+          nb::arg("use_name_loc_as_prefix") = false,
+          R"(
+            Returns the string form of value as an operand.
+
+            Args:
+              use_local_scope: Whether to use local scope for naming.
+              use_name_loc_as_prefix: Whether to use the location attribute (NameLoc) as prefix.
+
+            Returns:
+              The value's name as it appears in IR (e.g., `%0`, `%arg0`).)")
       .def(
           "get_name",
           [](PyValue &self, PyAsmState &state) {
@@ -4370,25 +4722,29 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                                     printAccum.getUserData());
             return printAccum.join();
           },
-          nb::arg("state"), kGetNameAsOperand)
-      .def_prop_ro("type",
-                   [](PyValue &self) -> nb::typed<nb::object, PyType> {
-                     return PyType(self.getParentOperation()->getContext(),
-                                   mlirValueGetType(self.get()))
-                         .maybeDownCast();
-                   })
+          nb::arg("state"),
+          "Returns the string form of value as an operand (i.e., the ValueID).")
+      .def_prop_ro(
+          "type",
+          [](PyValue &self) -> nb::typed<nb::object, PyType> {
+            return PyType(self.getParentOperation()->getContext(),
+                          mlirValueGetType(self.get()))
+                .maybeDownCast();
+          },
+          "Returns the type of the value.")
       .def(
           "set_type",
           [](PyValue &self, const PyType &type) {
-            return mlirValueSetType(self.get(), type);
+            mlirValueSetType(self.get(), type);
           },
-          nb::arg("type"))
+          nb::arg("type"), "Sets the type of the value.")
       .def(
           "replace_all_uses_with",
           [](PyValue &self, PyValue &with) {
             mlirValueReplaceAllUsesOfWith(self.get(), with.get());
           },
-          kValueReplaceAllUsesWithDocstring)
+          "Replace all uses of value with the new value, updating anything in "
+          "the IR that uses `self` to use the other value instead.")
       .def(
           "replace_all_uses_except",
           [](PyValue &self, PyValue &with, PyOperation &exception) {
@@ -4434,10 +4790,12 @@ void mlir::python::populateIRCore(nb::module_ &m) {
           },
           nb::arg("with_"), nb::arg("exceptions"),
           kValueReplaceAllUsesExceptDocstring)
-      .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
-           [](PyValue &self) -> nb::typed<nb::object, PyValue> {
-             return self.maybeDownCast();
-           })
+      .def(
+          MLIR_PYTHON_MAYBE_DOWNCAST_ATTR,
+          [](PyValue &self) -> nb::typed<nb::object, PyValue> {
+            return self.maybeDownCast();
+          },
+          "Downcasts the `Value` to a more specific kind if possible.")
       .def_prop_ro(
           "location",
           [](MlirValue self) {
@@ -4445,7 +4803,7 @@ void mlir::python::populateIRCore(nb::module_ &m) {
                 PyMlirContext::forContext(mlirValueGetContext(self)),
                 mlirValueGetLocation(self));
           },
-          "Returns the source location the value");
+          "Returns the source location of the value.");
 
   PyBlockArgument::bind(m);
   PyOpResult::bind(m);
@@ -4453,43 +4811,105 @@ void mlir::python::populateIRCore(nb::module_ &m) {
 
   nb::class_<PyAsmState>(m, "AsmState")
       .def(nb::init<PyValue &, bool>(), nb::arg("value"),
-           nb::arg("use_local_scope") = false)
+           nb::arg("use_local_scope") = false,
+           R"(
+             Creates an `AsmState` for consistent SSA value naming.
+
+             Args:
+               value: The value to create state for.
+               use_local_scope: Whether to use local scope for naming.)")
       .def(nb::init<PyOperationBase &, bool>(), nb::arg("op"),
-           nb::arg("use_local_scope") = false);
+           nb::arg("use_local_scope") = false,
+           R"(
+             Creates an AsmState for consistent SSA value naming.
+
+             Args:
+               op: The operation to create state for.
+               use_local_scope: Whether to use local scope for naming.)");
 
   //----------------------------------------------------------------------------
   // Mapping of SymbolTable.
   //----------------------------------------------------------------------------
   nb::class_<PySymbolTable>(m, "SymbolTable")
-      .def(nb::init<PyOperationBase &>())
-      .def("__getitem__",
-           [](PySymbolTable &self,
-              const std::string &name) -> nb::typed<nb::object, PyOpView> {
-             return self.dunderGetItem(name);
-           })
-      .def("insert", &PySymbolTable::insert, nb::arg("operation"))
-      .def("erase", &PySymbolTable::erase, nb::arg("operation"))
-      .def("__delitem__", &PySymbolTable::dunderDel)
-      .def("__contains__",
-           [](PySymbolTable &table, const std::string &name) {
-             return !mlirOperationIsNull(mlirSymbolTableLookup(
-                 table, mlirStringRefCreate(name.data(), name.length())));
-           })
+      .def(nb::init<PyOperationBase &>(),
+           R"(
+             Creates a symbol table for an operation.
+
+             Args:
+               operation: The `Operation` that defines a symbol table (e.g., a `ModuleOp`).
+
+             Raises:
+               TypeError: If the operation is not a symbol table.)")
+      .def(
+          "__getitem__",
+          [](PySymbolTable &self,
+             const std::string &name) -> nb::typed<nb::object, PyOpView> {
+            return self.dunderGetItem(name);
+          },
+          R"(
+            Looks up a symbol by name in the symbol table.
+
+            Args:
+              name: The name of the symbol to look up.
+
+            Returns:
+              The operation defining the symbol.
+
+            Raises:
+              KeyError: If the symbol is not found.)")
+      .def("insert", &PySymbolTable::insert, nb::arg("operation"),
+           R"(
+             Inserts a symbol operation into the symbol table.
+
+             Args:
+               operation: An operation with a symbol name to insert.
+
+             Returns:
+               The symbol name attribute of the inserted operation.
+
+             Raises:
+               ValueError: If the operation does not have a symbol name.)")
+      .def("erase", &PySymbolTable::erase, nb::arg("operation"),
+           R"(
+             Erases a symbol operation from the symbol table.
+
+             Args:
+               operation: The symbol operation to erase.
+
+             Note:
+               The operation is also erased from the IR and invalidated.)")
+      .def("__delitem__", &PySymbolTable::dunderDel,
+           "Deletes a symbol by name from the symbol table.")
+      .def(
+          "__contains__",
+          [](PySymbolTable &table, const std::string &name) {
+            return !mlirOperationIsNull(mlirSymbolTableLookup(
+                table, mlirStringRefCreate(name.data(), name.length())));
+          },
+          "Checks if a symbol with the given name exists in the table.")
       // Static helpers.
       .def_static("set_symbol_name", &PySymbolTable::setSymbolName,
-                  nb::arg("symbol"), nb::arg("name"))
+                  nb::arg("symbol"), nb::arg("name"),
+                  "Sets the symbol name for a symbol operation.")
       .def_static("get_symbol_name", &PySymbolTable::getSymbolName,
-                  nb::arg("symbol"))
+                  nb::arg("symbol"),
+                  "Gets the symbol name from a symbol operation.")
       .def_static("get_visibility", &PySymbolTable::getVisibility,
-                  nb::arg("symbol"))
+                  nb::arg("symbol"),
+                  "Gets the visibility attribute of a symbol operation.")
       .def_static("set_visibility", &PySymbolTable::setVisibility,
-                  nb::arg("symbol"), nb::arg("visibility"))
+                  nb::arg("symbol"), nb::arg("visibility"),
+                  "Sets the visibility attribute of a symbol operation.")
       .def_static("replace_all_symbol_uses",
                   &PySymbolTable::replaceAllSymbolUses, nb::arg("old_symbol"),
-                  nb::arg("new_symbol"), nb::arg("from_op"))
+                  nb::arg("new_symbol"), nb::arg("from_op"),
+                  "Replaces all uses of a symbol with a new symbol name within "
+                  "the given operation.")
       .def_static("walk_symbol_tables", &PySymbolTable::walkSymbolTables,
                   nb::arg("from_op"), nb::arg("all_sym_uses_visible"),
-                  nb::arg("callback"));
+                  nb::arg("callback"),
+                  "Walks symbol tables starting from an operation with a "
+                  "callback function.");
 
   // Container bindings.
   PyBlockArgumentList::bind(m);

From cd68056d13adcdabc5f42a05eff142b2ababdfd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gergely=20B=C3=A1lint?= <gergely.balint@arm.com>
Date: Mon, 10 Nov 2025 16:45:39 +0100
Subject: [PATCH 20/28] [BOLT] Simplify RAState helpers (NFCI) (#162820)

- unify isRAStateSigned and isRAStateUnsigned to a common getRAState,
- unify setRASigned and setRAUnsigned into setRAState(MCInst, bool),
- update users of these to match the new implementations.
---
 bolt/include/bolt/Core/MCPlusBuilder.h      | 19 +++-----
 bolt/lib/Core/MCPlusBuilder.cpp             | 27 +++++------
 bolt/lib/Passes/InsertNegateRAStatePass.cpp | 53 +++++++++++++++------
 bolt/lib/Passes/MarkRAStates.cpp            | 12 +----
 4 files changed, 58 insertions(+), 53 deletions(-)

diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 5e349cd69fb43..295558c0d3af8 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1371,20 +1371,13 @@ class MCPlusBuilder {
   /// Return true if \p Inst has RestoreState annotation.
   bool hasRestoreState(const MCInst &Inst) const;
 
-  /// Stores RA Signed annotation on \p Inst.
-  void setRASigned(MCInst &Inst) const;
+  /// Sets kRASigned or kRAUnsigned annotation on \p Inst.
+  /// Fails if \p Inst has either annotation already set.
+  void setRAState(MCInst &Inst, bool State) const;
 
-  /// Return true if \p Inst has Signed RA annotation.
-  bool isRASigned(const MCInst &Inst) const;
-
-  /// Stores RA Unsigned annotation on \p Inst.
-  void setRAUnsigned(MCInst &Inst) const;
-
-  /// Return true if \p Inst has Unsigned RA annotation.
-  bool isRAUnsigned(const MCInst &Inst) const;
-
-  /// Return true if \p Inst doesn't have any annotation related to RA state.
-  bool isRAStateUnknown(const MCInst &Inst) const;
+  /// Return true if \p Inst has kRASigned annotation, false if it has
+  /// kRAUnsigned annotation, and std::nullopt if neither annotation is set.
+  std::optional<bool> getRAState(const MCInst &Inst) const;
 
   /// Return true if the instruction is a call with an exception handling info.
   virtual bool isInvoke(const MCInst &Inst) const {
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index e96de80bfa701..0cb4ba1ebfbd7 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -186,26 +186,21 @@ bool MCPlusBuilder::hasRestoreState(const MCInst &Inst) const {
   return hasAnnotation(Inst, MCAnnotation::kRestoreState);
 }
 
-void MCPlusBuilder::setRASigned(MCInst &Inst) const {
+void MCPlusBuilder::setRAState(MCInst &Inst, bool State) const {
   assert(!hasAnnotation(Inst, MCAnnotation::kRASigned));
-  setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true);
-}
-
-bool MCPlusBuilder::isRASigned(const MCInst &Inst) const {
-  return hasAnnotation(Inst, MCAnnotation::kRASigned);
-}
-
-void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const {
   assert(!hasAnnotation(Inst, MCAnnotation::kRAUnsigned));
-  setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true);
+  if (State)
+    setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true);
+  else
+    setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true);
 }
 
-bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const {
-  return hasAnnotation(Inst, MCAnnotation::kRAUnsigned);
-}
-
-bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const {
-  return !(isRAUnsigned(Inst) || isRASigned(Inst));
+std::optional<bool> MCPlusBuilder::getRAState(const MCInst &Inst) const {
+  if (hasAnnotation(Inst, MCAnnotation::kRASigned))
+    return true;
+  if (hasAnnotation(Inst, MCAnnotation::kRAUnsigned))
+    return false;
+  return std::nullopt;
 }
 
 std::optional<MCLandingPad> MCPlusBuilder::getEHInfo(const MCInst &Inst) const {
diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
index 33664e1160a7b..775b7795e77c5 100644
--- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp
+++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp
@@ -21,7 +21,12 @@ using namespace llvm;
 namespace llvm {
 namespace bolt {
 
+static bool PassFailed = false;
+
 void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
+  if (PassFailed)
+    return;
+
   BinaryContext &BC = BF.getBinaryContext();
 
   if (BF.getState() == BinaryFunction::State::Empty)
@@ -39,7 +44,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
   for (FunctionFragment &FF : BF.getLayout().fragments()) {
     coverFunctionFragmentStart(BF, FF);
     bool FirstIter = true;
-    MCInst PrevInst;
+    bool PrevRAState = false;
     // As this pass runs after function splitting, we should only check
     // consecutive instructions inside FunctionFragments.
     for (BinaryBasicBlock *BB : FF) {
@@ -47,18 +52,23 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) {
         MCInst &Inst = *It;
         if (BC.MIB->isCFI(Inst))
           continue;
+        auto RAState = BC.MIB->getRAState(Inst);
+        if (!RAState) {
+          BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
+                    << " in function " << BF.getPrintName() << "\n";
+          PassFailed = true;
+          return;
+        }
         if (!FirstIter) {
           // Consecutive instructions with different RAState means we need to
           // add a OpNegateRAState.
-          if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) ||
-              (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) {
+          if (*RAState != PrevRAState)
             It = BF.addCFIInstruction(
                 BB, It, MCCFIInstruction::createNegateRAState(nullptr));
-          }
         } else {
           FirstIter = false;
         }
-        PrevInst = *It;
+        PrevRAState = *RAState;
       }
     }
   }
@@ -81,10 +91,17 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF,
       });
   // If a function is already split in the input, the first FF can also start
   // with Signed state. This covers that scenario as well.
-  if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) {
-    BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(),
-                         MCCFIInstruction::createNegateRAState(nullptr));
+  auto II = (*FirstNonEmpty)->getFirstNonPseudo();
+  auto RAState = BC.MIB->getRAState(*II);
+  if (!RAState) {
+    BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates "
+              << " in function " << BF.getPrintName() << "\n";
+    PassFailed = true;
+    return;
   }
+  if (*RAState)
+    BF.addCFIInstruction(*FirstNonEmpty, II,
+                         MCCFIInstruction::createNegateRAState(nullptr));
 }
 
 void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
@@ -96,15 +113,21 @@ void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) {
       if (BC.MIB->isCFI(Inst))
         continue;
 
-      if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) {
-        if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isPSignOnLR(PrevInst)) {
-          BC.MIB->setRASigned(Inst);
-        } else if (BC.MIB->isRAUnsigned(PrevInst) ||
-                   BC.MIB->isPAuthOnLR(PrevInst)) {
-          BC.MIB->setRAUnsigned(Inst);
+      auto RAState = BC.MIB->getRAState(Inst);
+      if (!FirstIter && !RAState) {
+        if (BC.MIB->isPSignOnLR(PrevInst))
+          RAState = true;
+        else if (BC.MIB->isPAuthOnLR(PrevInst))
+          RAState = false;
+        else {
+          auto PrevRAState = BC.MIB->getRAState(PrevInst);
+          RAState = PrevRAState ? *PrevRAState : false;
         }
+        BC.MIB->setRAState(Inst, *RAState);
       } else {
         FirstIter = false;
+        if (!RAState)
+          BC.MIB->setRAState(Inst, BF.getInitialRAState());
       }
       PrevInst = Inst;
     }
@@ -135,6 +158,8 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) {
             << " functions "
             << format("(%.2lf%%).\n", (100.0 * FunctionsModified) /
                                           BC.getBinaryFunctions().size());
+  if (PassFailed)
+    return createFatalBOLTError("");
   return Error::success();
 }
 
diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp
index b262d66732b7d..51075be0e1ac2 100644
--- a/bolt/lib/Passes/MarkRAStates.cpp
+++ b/bolt/lib/Passes/MarkRAStates.cpp
@@ -72,9 +72,6 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BF.setIgnored();
           return false;
         }
-        // The signing instruction itself is unsigned, the next will be
-        // signed.
-        BC.MIB->setRAUnsigned(Inst);
       } else if (BC.MIB->isPAuthOnLR(Inst)) {
         if (!RAState) {
           // RA authenticating instructions should only follow signed RA state.
@@ -86,15 +83,10 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) {
           BF.setIgnored();
           return false;
         }
-        // The authenticating instruction itself is signed, but the next will be
-        // unsigned.
-        BC.MIB->setRASigned(Inst);
-      } else if (RAState) {
-        BC.MIB->setRASigned(Inst);
-      } else {
-        BC.MIB->setRAUnsigned(Inst);
       }
 
+      BC.MIB->setRAState(Inst, RAState);
+
       // Updating RAState. All updates are valid from the next instruction.
       // Because the same instruction can have remember and restore, the order
       // here is relevant. This is the reason to loop over Annotations instead

From 51815b18d73e9a9ed021c789319cffdf1f735f3c Mon Sep 17 00:00:00 2001
From: ganenkokb-yandex <160136233+ganenkokb-yandex@users.noreply.github.com>
Date: Mon, 10 Nov 2025 18:45:54 +0300
Subject: [PATCH 21/28] [Clang][ASTImporter] Implement AST import for
 CXXParenListInitExpr, SubstNonTypeTemplateParmPackExpr, PseudoObjectExpr
 (#160904)

Add new visit functions to ASTImporter for CXXParenListInitExpr,
SubstNonTypeTemplateParmPackExpr and PseudoObjectExpr.
On CTU analysis there are lot of "cannot import unsupported AST node"
for CXXParenListInitExpr, SubstNonTypeTemplateParmPackExpr and
PseudoObjectExpr. Problem occurred after full support of Concepts in
importer.
---
 clang/lib/AST/ASTImporter.cpp           | 48 ++++++++++++++++++
 clang/unittests/AST/ASTImporterTest.cpp | 66 +++++++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index bf51c3e42719c..735f3157b694e 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -696,6 +696,10 @@ namespace clang {
     ExpectedStmt VisitCXXFoldExpr(CXXFoldExpr *E);
     ExpectedStmt VisitRequiresExpr(RequiresExpr* E);
     ExpectedStmt VisitConceptSpecializationExpr(ConceptSpecializationExpr* E);
+    ExpectedStmt
+    VisitSubstNonTypeTemplateParmPackExpr(SubstNonTypeTemplateParmPackExpr *E);
+    ExpectedStmt VisitPseudoObjectExpr(PseudoObjectExpr *E);
+    ExpectedStmt VisitCXXParenListInitExpr(CXXParenListInitExpr *E);
 
     // Helper for chaining together multiple imports. If an error is detected,
     // subsequent imports will return default constructed nodes, so that failure
@@ -9273,6 +9277,50 @@ ASTNodeImporter::VisitConceptSpecializationExpr(ConceptSpecializationExpr *E) {
       const_cast<ImplicitConceptSpecializationDecl *>(CSD), &Satisfaction);
 }
 
+ExpectedStmt ASTNodeImporter::VisitSubstNonTypeTemplateParmPackExpr(
+    SubstNonTypeTemplateParmPackExpr *E) {
+  Error Err = Error::success();
+  auto ToType = importChecked(Err, E->getType());
+  auto ToPackLoc = importChecked(Err, E->getParameterPackLocation());
+  auto ToArgPack = importChecked(Err, E->getArgumentPack());
+  auto ToAssociatedDecl = importChecked(Err, E->getAssociatedDecl());
+  if (Err)
+    return std::move(Err);
+
+  return new (Importer.getToContext()) SubstNonTypeTemplateParmPackExpr(
+      ToType, E->getValueKind(), ToPackLoc, ToArgPack, ToAssociatedDecl,
+      E->getIndex(), E->getFinal());
+}
+
+ExpectedStmt ASTNodeImporter::VisitPseudoObjectExpr(PseudoObjectExpr *E) {
+  SmallVector<Expr *, 4> ToSemantics(E->getNumSemanticExprs());
+  if (Error Err = ImportContainerChecked(E->semantics(), ToSemantics))
+    return std::move(Err);
+  auto ToSyntOrErr = import(E->getSyntacticForm());
+  if (!ToSyntOrErr)
+    return ToSyntOrErr.takeError();
+  return PseudoObjectExpr::Create(Importer.getToContext(), *ToSyntOrErr,
+                                  ToSemantics, E->getResultExprIndex());
+}
+
+ExpectedStmt
+ASTNodeImporter::VisitCXXParenListInitExpr(CXXParenListInitExpr *E) {
+  Error Err = Error::success();
+  auto ToType = importChecked(Err, E->getType());
+  auto ToInitLoc = importChecked(Err, E->getInitLoc());
+  auto ToBeginLoc = importChecked(Err, E->getBeginLoc());
+  auto ToEndLoc = importChecked(Err, E->getEndLoc());
+  if (Err)
+    return std::move(Err);
+
+  SmallVector<Expr *, 4> ToArgs(E->getInitExprs().size());
+  if (Error Err = ImportContainerChecked(E->getInitExprs(), ToArgs))
+    return std::move(Err);
+  return CXXParenListInitExpr::Create(Importer.getToContext(), ToArgs, ToType,
+                                      E->getUserSpecifiedInitExprs().size(),
+                                      ToInitLoc, ToBeginLoc, ToEndLoc);
+}
+
 Error ASTNodeImporter::ImportOverriddenMethods(CXXMethodDecl *ToMethod,
                                                CXXMethodDecl *FromMethod) {
   Error ImportErrors = Error::success();
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 4c7ea5e338a13..3cab4c600b1b1 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -3300,6 +3300,72 @@ TEST_P(ImportExpr, ConceptNestedNonInstantiationDependentRequirement) {
              conceptDecl(has(requiresExpr(has(requiresExprBodyDecl())))));
 }
 
+TEST_P(ImportExpr, ImportSubstNonTypeTemplateParmPackExpr) {
+  MatchVerifier<Decl> Verifier;
+  const char *Code = R"(
+    template<auto ...> struct X {};
+    template<typename ...> struct Z {};
+
+    template<int ...N> struct E {
+      template<int ...M> using B = Z<X<N, M>...>;
+      template<int M1, int M2> E(B<M1, M2>);
+    };
+    using declToImport = E<1, 3>;
+  )";
+  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
+             typedefNameDecl(hasName("declToImport")));
+}
+
+TEST_P(ImportExpr, ImportCXXParenListInitExpr) {
+  MatchVerifier<Decl> Verifier;
+  const char *Code = R"(
+    struct Node {
+      int val;
+      double d;
+    };
+    Node* declToImport() { return new Node(2, 3.14); }
+  )";
+  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
+             functionDecl(hasName("declToImport")));
+}
+
+TEST_P(ImportExpr, ImportPseudoObjectExpr) {
+  MatchVerifier<Decl> Verifier;
+  const char *Code = R"(
+  namespace std {
+    struct strong_ordering {
+      int n;
+      constexpr operator int() const { return n; }
+      static const strong_ordering less, equal, greater;
+    };
+    constexpr strong_ordering strong_ordering::less{-1},
+        strong_ordering::equal{0}, strong_ordering::greater{1};
+  }
+
+  struct A {
+    std::strong_ordering operator<=>(const A&) const;
+  };
+  struct B {
+    bool operator==(const B&) const;
+    bool operator<(const B&) const;
+  };
+
+  template<typename T> struct Cmp : T {
+    std::strong_ordering operator<=>(const Cmp&) const = default;
+  };
+
+  void use(...);
+  void declToImport() {
+    use(
+      Cmp<A>() <=> Cmp<A>(),
+      Cmp<B>() <=> Cmp<B>()
+    );
+  }
+  )";
+  testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier,
+             functionDecl(hasName("declToImport")));
+}
+
 class ImportImplicitMethods : public ASTImporterOptionSpecificTestBase {
 public:
   static constexpr auto DefaultCode = R"(

From 9625cf6cc0e3e530ea0bed971d85b363f77c49d8 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 10 Nov 2025 15:49:53 +0000
Subject: [PATCH 22/28] [BAZEL] Add missing dependency on /llvm:Support from
 XeGPUTransformOps (#167332)

Fixes 1553f90f93d30b41457097cf274c3791b182f316
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index a4ea627fb3d16..7066f498c7d49 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3888,6 +3888,7 @@ cc_library(
         ":XeGPUDialect",
         ":XeGPUTransformOpsIncGen",
         ":XeGPUUtils",
+        "//llvm:Support",
     ],
 )
 

From 741ba8209c1f9bd5b1a145d9c137f5e18bfffb84 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 07:56:33 -0800
Subject: [PATCH 23/28] AMDGPU: Add baseline test for known bits of
 AssertNoFPClass (#167288)

---
 .../AMDGPU/compute-known-bits-nofpclass.ll    | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll

diff --git a/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll
new file mode 100644
index 0000000000000..d440d58246333
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+define i32 @known_positive(float nofpclass(nan ninf nzero nsub nnorm) %signbit.zero) #0 {
+; CHECK-LABEL: known_positive:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %cast = bitcast float %signbit.zero to i32
+  %and = and i32 %cast, 2147483647
+  ret i32 %and
+}
+
+define i32 @known_positive_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %signbit.zero) #0 {
+; CHECK-LABEL: known_positive_maybe_nan:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %cast = bitcast float %signbit.zero to i32
+  %and = and i32 %cast, 2147483647
+  ret i32 %and
+}
+
+define i32 @known_negative(float nofpclass(nan pinf pzero psub pnorm) %signbit.one) #0 {
+; CHECK-LABEL: known_negative:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %cast = bitcast float %signbit.one to i32
+  %or = or i32 %cast, -2147483648
+  ret i32 %or
+}
+
+define i32 @known_negative_maybe_nan(float nofpclass(pinf pzero psub pnorm) %signbit.one) #0 {
+; CHECK-LABEL: known_negative_maybe_nan:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %cast = bitcast float %signbit.one to i32
+  %or = or i32 %cast, -2147483648
+  ret i32 %or
+}
+
+attributes #0 = { nounwind }

From 726c049e56e2af81d617b59d0ef282787e34ceb8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 08:04:49 -0800
Subject: [PATCH 24/28] AMDGPU: Add baseline test for nofpclass on call results
 (#167263)

---
 llvm/test/CodeGen/AMDGPU/nofpclass-call.ll | 199 +++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/nofpclass-call.ll

diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
new file mode 100644
index 0000000000000..1861f02ec8b1c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Check that nofpclass attributes on call returns are used in
+; selectiondag.
+
+define internal float @func_f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: func_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_load_dword v0, v[0:1], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %ld = load volatile float, ptr addrspace(1) %ptr
+  ret float %ld
+}
+
+define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: call_nofpclass_funcs_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s18, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v4, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, func_f32@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, func_f32@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v4, s31, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_max_f32_e32 v1, v3, v3
+; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
+; CHECK-NEXT:    v_min_f32_e32 v0, v1, v0
+; CHECK-NEXT:    v_readlane_b32 s31, v4, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call0 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr)
+  %call1 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr)
+  %min = call float @llvm.minnum.f32(float %call0, float %call1)
+  ret float %min
+}
+
+define internal <2 x float> @func_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: func_v2f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %ld = load volatile <2 x float>, ptr addrspace(1) %ptr
+  ret <2 x float> %ld
+}
+
+define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: call_nofpclass_funcs_v2f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s18, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, func_v2f32@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_mov_b32_e32 v0, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_max_f32_e32 v2, v4, v4
+; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
+; CHECK-NEXT:    v_min_f32_e32 v0, v2, v0
+; CHECK-NEXT:    v_max_f32_e32 v2, v5, v5
+; CHECK-NEXT:    v_max_f32_e32 v1, v1, v1
+; CHECK-NEXT:    v_min_f32_e32 v1, v2, v1
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call0 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr)
+  %call1 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr)
+  %min = call <2 x float> @llvm.minnum.v2f32(<2 x float> %call0, <2 x float> %call1)
+  ret <2 x float> %min
+}
+
+define internal double @func_f64(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: func_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %ld = load volatile double, ptr addrspace(1) %ptr
+  ret double %ld
+}
+
+define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: call_nofpclass_funcs_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_mov_b32 s18, s33
+; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
+; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
+; CHECK-NEXT:    s_addk_i32 s32, 0x400
+; CHECK-NEXT:    v_writelane_b32 v6, s30, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, func_f64@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, func_f64@rel32@hi+12
+; CHECK-NEXT:    v_writelane_b32 v6, s31, 1
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v5, v0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
+; CHECK-NEXT:    v_mov_b32_e32 v0, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, v4
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; CHECK-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
+; CHECK-NEXT:    s_mov_b32 s32, s33
+; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
+; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
+; CHECK-NEXT:    s_mov_b32 s33, s18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call0 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr)
+  %call1 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr)
+  %min = call double @llvm.minnum.f64(double %call0, double %call1)
+  ret double %min
+}
+
+define float @call_nofpclass_intrinsic_f32(float %x, float %y, float %z) {
+; CHECK-LABEL: call_nofpclass_intrinsic_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_sqrt_f32_e32 v0, v0
+; CHECK-NEXT:    v_sqrt_f32_e32 v1, v1
+; CHECK-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call0 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %x)
+  %call1 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %y)
+  %lt = fcmp olt float %call0, %call1
+  %min = select nsz i1 %lt, float %call0, float %call1
+  ret float %min
+}
+
+define <2 x half> @call_nofpclass_intrinsic_v2f16(float %x, float %y, float %z, float %w) {
+; CHECK-LABEL: call_nofpclass_intrinsic_v2f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
+; CHECK-NEXT:    v_cvt_pkrtz_f16_f32 v1, v2, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; CHECK-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT:    v_cmp_lt_f16_e32 vcc, v3, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; CHECK-NEXT:    s_mov_b32 s4, 0x5040100
+; CHECK-NEXT:    v_perm_b32 v0, v1, v0, s4
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %call0 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+  %call1 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %z, float %w)
+  %lt = fcmp olt <2 x half> %call0, %call1
+  %min = select nsz <2 x i1> %lt, <2 x half> %call0, <2 x half> %call1
+  ret <2 x half> %min
+}

From 54053cf95d4d076f738c1a3e0ff38bd62a46ce32 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 10 Nov 2025 08:01:51 -0800
Subject: [PATCH 25/28] AMDGPU: Add baseline tests for copysign with known
 signmask input (#167265)

---
 .../AMDGPU/copysign-simplify-demanded-bits.ll | 108 +++++++++
 .../AMDGPU/copysign-to-disjoint-or-combine.ll | 208 ++++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll

diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index f5227eed458d6..0be2b5c70c93b 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -370,4 +370,112 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
   ret float %pow_sign1
 }
 
+define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
+; GFX9-LABEL: test_pow_fast_f64integral_y:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s16, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-NEXT:    v_writelane_b32 v43, s16, 15
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v43, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v43, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v43, s48, 8
+; GFX9-NEXT:    v_writelane_b32 v43, s49, 9
+; GFX9-NEXT:    v_writelane_b32 v43, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
+; GFX9-NEXT:    v_mov_b32_e32 v42, v1
+; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v43, s54, 14
+; GFX9-NEXT:    v_mov_b32_e32 v40, v31
+; GFX9-NEXT:    v_mov_b32_e32 v41, v2
+; GFX9-NEXT:    s_mov_b32 s50, s15
+; GFX9-NEXT:    s_mov_b32 s51, s14
+; GFX9-NEXT:    s_mov_b32 s52, s13
+; GFX9-NEXT:    s_mov_b32 s53, s12
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GFX9-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GFX9-NEXT:    s_brev_b32 s54, -2
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, _Z4exp2d@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4exp2d@rel32@hi+12
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT:    s_mov_b32 s12, s53
+; GFX9-NEXT:    s_mov_b32 s13, s52
+; GFX9-NEXT:    s_mov_b32 s14, s51
+; GFX9-NEXT:    s_mov_b32 s15, s50
+; GFX9-NEXT:    v_mov_b32_e32 v31, v40
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v42
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_bfi_b32 v1, s54, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s54, v43, 14
+; GFX9-NEXT:    v_readlane_b32 s53, v43, 13
+; GFX9-NEXT:    v_readlane_b32 s52, v43, 12
+; GFX9-NEXT:    v_readlane_b32 s51, v43, 11
+; GFX9-NEXT:    v_readlane_b32 s50, v43, 10
+; GFX9-NEXT:    v_readlane_b32 s49, v43, 9
+; GFX9-NEXT:    v_readlane_b32 s48, v43, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v43, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v43, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v43, 15
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %fabs = call fast double @llvm.fabs.f64(double %x)
+  %log2 = call fast double @_Z4log2d(double %fabs)
+  %pownI2F = sitofp i32 %y.i to double
+  %ylogx = fmul fast double %log2, %pownI2F
+  %exp2 = call fast nofpclass(nan ninf nzero nsub nnorm) double @_Z4exp2d(double %ylogx)
+  %ytou = zext i32 %y.i to i64
+  %yeven = shl i64 %ytou, 63
+  %x.i64 = bitcast double %x to i64
+  %pow_sign = and i64 %yeven, %x.i64
+  %pow_sign.f64 = bitcast i64 %pow_sign to double
+  %pow_sign1 = call fast double @llvm.copysign.f64(double %exp2, double %pow_sign.f64)
+  ret double %pow_sign1
+}
+
+declare hidden double @_Z4exp2d(double) #1
+declare hidden double @_Z4log2d(double) #1
+
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { norecurse nounwind memory(read) }
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
new file mode 100644
index 0000000000000..b99b64316b62f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
@@ -0,0 +1,208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Negative test, don't know %x is positive
+define half @copysign_known_signmask_f16(half %x, i16 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i16 %sign, 15
+  %signmask.bitcast = bitcast i16 %signmask to half
+  %result = call half @llvm.copysign.f16(half %x, half %signmask.bitcast)
+  ret half %result
+}
+
+; Negative test, don't know %x is positive
+define float @copysign_known_signmask_f32(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %x, float %signmask.bitcast)
+  ret float %result
+}
+
+; Negative test, don't know %x is positive
+define double @copysign_known_signmask_f64(double %x, i64 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i64 %sign, 63
+  %signmask.bitcast = bitcast i64 %signmask to double
+  %result = call double @llvm.copysign.f64(double %x, double %signmask.bitcast)
+  ret double %result
+}
+
+; Negative test, don't know %x is positive
+define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+; Negative test, don't know %x is positive
+define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero(float nofpclass(nan ninf nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i16 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f16_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i16 %sign, 15
+  %signmask.bitcast = bitcast i16 %signmask to half
+  %result = call half @llvm.copysign.f16(half %sign.bit.known.zero, half %signmask.bitcast)
+  ret half %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i64 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f64_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i64 %sign, 63
+  %signmask.bitcast = bitcast i64 %signmask to double
+  %result = call double @llvm.copysign.f64(double %sign.bit.known.zero, double %signmask.bitcast)
+  ret double %result
+}
+
+; exp always returns a positive result, excluding the unknown nan sign
+; bit.
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2aeac50
+; GFX9-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x114b4ea4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_not_b32_e32 v2, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_not_b32_e32 v2, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag_through_fence(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag_through_fence:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    ;ARITH_FENCE
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %fence = call float @llvm.arithmetic.fence.f32(float %sign.bit.known.zero)
+  %result = call float @llvm.copysign.f32(float %fence, float %signmask.bitcast)
+  ret float %result
+}

From b9e22ccd95094ebf87e916d4e5bf8992204e927e Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 10 Nov 2025 17:10:32 +0100
Subject: [PATCH 26/28] [Flang][driver] Do not emit -latomic on link line on
 Windows (#164648)

Flang on Windows added `-latomic` to the link line. This library does
not exist on Windows and the linker gives a warning.
---
 clang/lib/Driver/ToolChain.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index eea5c2f7f4a6a..7f70f1c444eb9 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -851,8 +851,11 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
                    options::OPT_fno_openmp, false)) {
     Driver::OpenMPRuntimeKind OMPRuntime = getDriver().getOpenMPRuntime(Args);
     ToolChain::RuntimeLibType RuntimeLib = GetRuntimeLibType(Args);
-    if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc)
+    if ((OMPRuntime == Driver::OMPRT_OMP &&
+         RuntimeLib == ToolChain::RLT_Libgcc) &&
+        !getTriple().isKnownWindowsMSVCEnvironment()) {
       CmdArgs.push_back("-latomic");
+    }
   }
 }
 

From 38cade7cc6e599bb6bc8f0af98eb91cba9ceb5df Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough@gmail.com>
Date: Mon, 10 Nov 2025 10:11:53 -0600
Subject: [PATCH 27/28] [PGO][Offload] Fix missing names bug in GPU PGO
 (#166444)

After #163011 was merged, the tests in
[`offload/test/offloading/gpupgo`](https://github.com/llvm/llvm-project/compare/main...EthanLuisMcDonough:llvm-project:gpupgo-names-fix-pr?expand=1#diff-f769f6cebd25fa527bd1c1150cc64eb585c41cb8a8b325c2bc80c690e47506a1)
broke because the offload plugins were no longer able to find
`__llvm_prf_nm`. This pull request explicitly makes `__llvm_prf_nm`
visible to the host on GPU targets and reverses the changes made in
f7e9968a5ba99521e6e51161f789f0cc1745193f.
---
 llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 4 ++++
 offload/test/offloading/gpupgo/pgo_atomic_teams.c      | 1 -
 offload/test/offloading/gpupgo/pgo_atomic_threads.c    | 1 -
 offload/test/offloading/gpupgo/pgo_device_and_host.c   | 1 -
 offload/test/offloading/gpupgo/pgo_device_only.c       | 1 -
 5 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index b5548d4f24a2f..8c8d16a6e3d25 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1944,6 +1944,10 @@ void InstrLowerer::emitNameData() {
   NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
+  if (isGPUProfTarget(M)) {
+    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
+    NamesVar->setVisibility(GlobalValue::ProtectedVisibility);
+  }
 
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
index 42d8ae43beba1..b3b72db080392 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
@@ -18,7 +18,6 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
index 09a4dc1577822..440a6b533317d 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
@@ -18,7 +18,6 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 
diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c
index c53e69a25e50d..3e95791ce9a50 100644
--- a/offload/test/offloading/gpupgo/pgo_device_and_host.c
+++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c
@@ -50,7 +50,6 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu
 
 int main() {
   int host_var = 0;
diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c
index 644df6e7b0339..2939af613b6dd 100644
--- a/offload/test/offloading/gpupgo/pgo_device_only.c
+++ b/offload/test/offloading/gpupgo/pgo_device_only.c
@@ -16,7 +16,6 @@
 
 // REQUIRES: amdgpu
 // REQUIRES: pgo
-// XFAIL: amdgpu
 
 int test1(int a) { return a / 2; }
 int test2(int a) { return a * 2; }

From 26b4ac0fd291517048ed2fcb49a94116aecf7282 Mon Sep 17 00:00:00 2001
From: Ron Lieberman <ron.lieberman@amd.com>
Date: Mon, 10 Nov 2025 15:18:41 -0600
Subject: [PATCH 28/28] Revert "[OMPIRBuilder] Fix addrspace of internal
 critical section lock (#166459)"

This reverts commit c17a839d8335ae75447221adf62f7993e575a913.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 25 +++++++------------
 clang/test/OpenMP/force-usm.c                 |  2 +-
 .../OpenMP/spirv_target_codegen_basic.cpp     |  6 -----
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 13 +++++-----
 revert_patches.txt                            |  3 +++
 6 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index b5d1afdb11805..68a83561d7cb1 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2024,29 +2024,22 @@ void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
   // Prepare arguments and build a call to __kmpc_critical
   if (!CGF.HaveInsertPoint())
     return;
-  llvm::FunctionCallee RuntimeFcn = OMPBuilder.getOrCreateRuntimeFunction(
-      CGM.getModule(),
-      Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical);
-  llvm::Value *LockVar = getCriticalRegionLock(CriticalName);
-  unsigned LockVarArgIdx = 2;
-  if (cast<llvm::GlobalVariable>(LockVar)->getAddressSpace() !=
-      RuntimeFcn.getFunctionType()
-          ->getParamType(LockVarArgIdx)
-          ->getPointerAddressSpace())
-    LockVar = CGF.Builder.CreateAddrSpaceCast(
-        LockVar, RuntimeFcn.getFunctionType()->getParamType(LockVarArgIdx));
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
-                         LockVar};
+                         getCriticalRegionLock(CriticalName)};
   llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
                                                 std::end(Args));
   if (Hint) {
     EnterArgs.push_back(CGF.Builder.CreateIntCast(
         CGF.EmitScalarExpr(Hint), CGM.Int32Ty, /*isSigned=*/false));
   }
-  CommonActionTy Action(RuntimeFcn, EnterArgs,
-                        OMPBuilder.getOrCreateRuntimeFunction(
-                            CGM.getModule(), OMPRTL___kmpc_end_critical),
-                        Args);
+  CommonActionTy Action(
+      OMPBuilder.getOrCreateRuntimeFunction(
+          CGM.getModule(),
+          Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical),
+      EnterArgs,
+      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
+                                            OMPRTL___kmpc_end_critical),
+      Args);
   CriticalOpGen.setAction(Action);
   emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
 }
diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c
index 45c0e28b525da..5c63a9a5e7004 100644
--- a/clang/test/OpenMP/force-usm.c
+++ b/clang/test/OpenMP/force-usm.c
@@ -46,7 +46,7 @@ int main(void) {
 // CHECK-USM-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK-USM:       user_code.entry:
 // CHECK-USM-NEXT:    store i32 1, ptr [[TMP0]], align 4
-// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(1) @pGI_decl_tgt_ref_ptr, align 8
+// CHECK-USM-NEXT:    [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8
 // CHECK-USM-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8
 // CHECK-USM-NEXT:    store i32 2, ptr [[TMP3]], align 4
 // CHECK-USM-NEXT:    call void @__kmpc_target_deinit()
diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
index 6e029fb93644d..fb2810e88c063 100644
--- a/clang/test/OpenMP/spirv_target_codegen_basic.cpp
+++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp
@@ -6,18 +6,12 @@
 // CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer
 // CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy
 
-// CHECK: @"_gomp_critical_user_$var" = common addrspace(1) global [8 x i32] zeroinitializer, align 8
-
 // CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}}
 
-// CHECK: call spir_func addrspace(9) void @__kmpc_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
-// CHECK: call spir_func addrspace(9) void @__kmpc_end_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4)))
-
 int main() {
   int ret = 0;
   #pragma omp target
   for(int i = 0; i < 5; i++)
-    #pragma omp critical
     ret++;
   return ret;
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 48d8b695c9aa8..857c5da91a9f5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -3669,7 +3669,7 @@ class OpenMPIRBuilder {
   /// \param Name Name of the variable.
   LLVM_ABI GlobalVariable *
   getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
-                              std::optional<unsigned> AddressSpace = {});
+                              unsigned AddressSpace = 0);
 };
 
 /// Class to represented the control flow structure of an OpenMP canonical loop.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2a43961fa167e..377e180d1d044 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -8567,8 +8567,9 @@ OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
                                                 Config.separator());
 }
 
-GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
-    Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
+GlobalVariable *
+OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
+                                             unsigned AddressSpace) {
   auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
   if (Elem.second) {
     assert(Elem.second->getValueType() == Ty &&
@@ -8578,18 +8579,16 @@ GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
     // variable for possibly changing that to internal or private, or maybe
     // create different versions of the function for different OMP internal
     // variables.
-    const DataLayout &DL = M.getDataLayout();
-    unsigned AddressSpaceVal =
-        AddressSpace ? *AddressSpace : DL.getDefaultGlobalsAddressSpace();
     auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
                        ? GlobalValue::InternalLinkage
                        : GlobalValue::CommonLinkage;
     auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
                                   Constant::getNullValue(Ty), Elem.first(),
                                   /*InsertBefore=*/nullptr,
-                                  GlobalValue::NotThreadLocal, AddressSpaceVal);
+                                  GlobalValue::NotThreadLocal, AddressSpace);
+    const DataLayout &DL = M.getDataLayout();
     const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
-    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
+    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
     GV->setAlignment(std::max(TypeAlign, PtrAlign));
     Elem.second = GV;
   }
diff --git a/revert_patches.txt b/revert_patches.txt
index 9e465ba90ae6a..b88b846f64b68 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
+breaks fortran declare-target-link1
+[OMPIRBuilder] Fix addrspace of internal critical section lock (#166459
+---