From 13a39eaa0bcf6c439e8b59571f4afe593d658623 Mon Sep 17 00:00:00 2001
From: hstk30-hw
Date: Mon, 24 Nov 2025 12:49:07 +0800
Subject: [PATCH 01/38] [Sema] Fix Wunused-but-set-variable warning (NFC)
 (#169220)

Fix warning:

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp:1455:23: warning: variable 'Store' set but not used [-Wunused-but-set-variable]
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 003a049da91e4..2220d1a045de4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1489,6 +1489,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
     // operand against vputils::isSingleScalar.
     assert(RepOrWidenR != Store->getStoredValue() ||
            vputils::isSingleScalar(Store->getStoredValue()));
+    (void)Store;
     return true;
   }

From b53e46f71af06e0338ddff8d6d3c87230d4b441d Mon Sep 17 00:00:00 2001
From: Arun Thangamani
Date: Mon, 24 Nov 2025 11:03:07 +0530
Subject: [PATCH 02/38] [mlir][x86vector] Lower vector.contract to FMA or
 packed type dot-product (#168074)

A `transform` pass to lower `vector.contract` to (a) `vector.fma` for
`F32`, (b) `x86vector.avx512.dot` for `BF16`, and (c) `x86vector.avx.dot.i8`
for `Int8` packed types.

The lowering requires the `m`, `batch`, and `k` dims to be one, and the
`vnni` dim to be `2` for `bf16` or `4` for `int8`.

**The lowering pattern**: `batch_reduce.matmul` (input) ->
register-tiling (M, N) -> vectorization (to `vector.contract`) ->
`unroll` of vector.contract (`unit` dims) -> `hoisting` transformation
(move `C` loads/stores outside the batch/k loops) -> apply `licm`,
`canonicalization`, and `bufferize`.
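For illustration, a minimal sketch of the F32 case as exercised by the new
tests (the `%lhs`/`%rhs`/`%acc` names and the flattened 1-D intermediate
values are illustrative only; the actual rewrite routes operands through
`vector.shape_cast` as in the pattern below):

```mlir
// Before: an outer-product style contraction with unit m/k dims.
%0 = vector.contract {indexing_maps = [#map, #map1, #map2],
                      iterator_types = ["parallel", "parallel", "reduction"],
                      kind = #vector.kind<add>}
  %lhs, %rhs, %acc : vector<1x1xf32>, vector<1x64xf32> into vector<1x64xf32>

// After: the unit-dim LHS is broadcast and a single FMA is emitted.
%b = vector.broadcast %lhs_1d : vector<1xf32> to vector<64xf32>
%r = vector.fma %b, %rhs_1d, %acc_1d : vector<64xf32>
```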
--- .../mlir/Dialect/X86Vector/CMakeLists.txt | 2 + .../X86Vector/TransformOps/CMakeLists.txt | 4 + .../TransformOps/X86VectorTransformOps.h | 31 + .../TransformOps/X86VectorTransformOps.td | 43 ++ .../mlir/Dialect/X86Vector/Transforms.h | 12 + mlir/lib/Dialect/X86Vector/CMakeLists.txt | 1 + .../X86Vector/TransformOps/CMakeLists.txt | 17 + .../TransformOps/X86VectorTransformOps.cpp | 64 ++ .../X86Vector/Transforms/CMakeLists.txt | 2 + .../Transforms/VectorContractToFMA.cpp | 143 ++++ .../VectorContractToPackedTypeDotProduct.cpp | 301 ++++++++ mlir/lib/RegisterAllExtensions.cpp | 2 + .../X86Vector/vector-contract-to-fma.mlir | 344 +++++++++ ...or-contract-to-packed-type-dotproduct.mlir | 681 ++++++++++++++++++ 14 files changed, 1647 insertions(+) create mode 100644 mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt create mode 100644 mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h create mode 100644 mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td create mode 100644 mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt create mode 100644 mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp create mode 100644 mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp create mode 100644 mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp create mode 100644 mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir create mode 100644 mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir diff --git a/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt b/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt index 0fe01824b8248..bbe8e4eb892dd 100644 --- a/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt @@ -3,3 +3,5 @@ add_mlir_doc(X86Vector X86Vector Dialects/ -gen-dialect-doc -dialect=x86vector) add_mlir_interface(X86VectorInterfaces) add_dependencies(MLIRX86VectorIncGen MLIRX86VectorInterfacesIncGen) + +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt new file mode 100644 index 0000000000000..6f377e10fa8f8 --- /dev/null +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS X86VectorTransformOps.td) +mlir_tablegen(X86VectorTransformOps.h.inc -gen-op-decls) +mlir_tablegen(X86VectorTransformOps.cpp.inc -gen-op-defs) +add_mlir_dialect_tablegen_target(MLIRX86VectorTransformOpsIncGen) diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h new file mode 100644 index 0000000000000..e1d8b8762e799 --- /dev/null +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h @@ -0,0 +1,31 @@ +//===- X86VectorTransformOps.h - X86Vector transform ops --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H
+#define MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H
+
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// X86Vector Transform Operations
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h.inc"
+
+namespace mlir {
+class DialectRegistry;
+
+namespace x86vector {
+void registerTransformDialectExtension(DialectRegistry &registry);
+
+} // namespace x86vector
+} // namespace mlir
+
+#endif // MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H
diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td
new file mode 100644
index 0000000000000..3c5294ff14fc7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td
@@ -0,0 +1,43 @@
+//===- X86VectorTransformOps.td - X86Vector transform ops --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86VECTOR_TRANSFORM_OPS
+#define X86VECTOR_TRANSFORM_OPS
+
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+include "mlir/Dialect/Transform/IR/TransformAttrs.td"
+include "mlir/Dialect/Transform/IR/TransformTypes.td"
+include "mlir/IR/RegionKindInterface.td"
+
+def ApplyVectorContractToFMAPatternsOp : Op<Transform_Dialect,
+    "apply_patterns.x86vector.vector_contract_to_fma",
+    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
+  let description = [{
+    Collect patterns to lower an F32-type vector.contract operation to an FMA.
+  }];
+
+  let assemblyFormat = "attr-dict";
+}
+
+def ApplyVectorContractToPackedTypeDotProductPatternsOp : Op<Transform_Dialect,
+    "apply_patterns.x86vector.vector_contract_to_packed_type_dot_product",
+    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
+  let description = [{
+    Collect patterns to lower a BF16/Int8-type vector.contract operation
+    to a BF16/Int8 dot-product.
+  }];
+
+  let assemblyFormat = "attr-dict";
+}
+
+#endif // X86VECTOR_TRANSFORM_OPS
+
diff --git a/mlir/include/mlir/Dialect/X86Vector/Transforms.h b/mlir/include/mlir/Dialect/X86Vector/Transforms.h
index d54111ca41e69..fc46dff63c2b7 100644
--- a/mlir/include/mlir/Dialect/X86Vector/Transforms.h
+++ b/mlir/include/mlir/Dialect/X86Vector/Transforms.h
@@ -79,6 +79,18 @@ struct MaskHelper {
   }
 };
 
+//===----------------------------------------------------------------------===//
+
+// A set of patterns for specialized lowering of vector contraction
+// operations to vector fused multiply-add (FMA) operations.
+void populateVectorContractToFMAPatterns(RewritePatternSet &patterns);
+
+// A set of patterns for lowering 32-bit packed vector contraction operations
+// to their corresponding packed-type dot-product operations, ultimately
+// targeting the relevant x86 LLVM intrinsics (e.g., BF16 and Int8).
+void populateVectorContractToPackedTypeDotProductPatterns(
+    RewritePatternSet &patterns);
+
 //===----------------------------------------------------------------------===//
 /// Helpers extracted from:
 ///   - clang/lib/Headers/avxintrin.h
diff --git a/mlir/lib/Dialect/X86Vector/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/CMakeLists.txt
index 9f57627c321fb..cb1e9d01821a2 100644
--- a/mlir/lib/Dialect/X86Vector/CMakeLists.txt
+++ b/mlir/lib/Dialect/X86Vector/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(IR)
 add_subdirectory(Transforms)
+add_subdirectory(TransformOps)
diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt
new file mode 100644
index 0000000000000..f4c9f8a05acbc
--- /dev/null
+++ b/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_mlir_dialect_library(MLIRX86VectorTransformOps
+  X86VectorTransformOps.cpp
+
+  DEPENDS
+  MLIRX86VectorTransformOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRLLVMCommonConversion
+  MLIRLLVMDialect
+  MLIRVectorDialect
+  MLIRSideEffectInterfaces
+  MLIRTransformDialect
+  MLIRTransformDialectUtils
+  MLIRX86VectorDialect
+  MLIRX86VectorTransforms
+  )
diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp
new file mode 100644
index 0000000000000..95db208207672
--- /dev/null
+++ b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp
@@ -0,0 +1,64 @@
+//===- X86VectorTransformOps.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/X86Vector/Transforms.h"
+#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
+
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/RegionKindInterface.h"
+
+using namespace mlir;
+using namespace mlir::x86vector;
+using namespace mlir::transform;
+
+void mlir::transform::ApplyVectorContractToFMAPatternsOp::populatePatterns(
+    RewritePatternSet &patterns) {
+  x86vector::populateVectorContractToFMAPatterns(patterns);
+}
+
+void mlir::transform::ApplyVectorContractToPackedTypeDotProductPatternsOp::
+    populatePatterns(RewritePatternSet &patterns) {
+  x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns);
+}
+
+//===----------------------------------------------------------------------===//
+// Transform op registration
+//===----------------------------------------------------------------------===//
+
+namespace {
+class X86VectorTransformDialectExtension
+    : public transform::TransformDialectExtension<
+          X86VectorTransformDialectExtension> {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(
+      X86VectorTransformDialectExtension)
+
+  X86VectorTransformDialectExtension() {
+    declareGeneratedDialect<vector::VectorDialect>();
+    declareGeneratedDialect<x86vector::X86VectorDialect>();
+    registerTransformOps<
+#define GET_OP_LIST
+#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc"
+        >();
+  }
+};
+} // namespace
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc"
+
+void mlir::x86vector::registerTransformDialectExtension(
+    DialectRegistry &registry) {
+  registry.addExtensions<X86VectorTransformDialectExtension>();
+}
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
index c51266afe9e8f..3d2288049e49e 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
@@ -1,6 +1,8 @@
 add_mlir_dialect_library(MLIRX86VectorTransforms
   AVXTranspose.cpp
   LegalizeForLLVMExport.cpp
+  VectorContractToFMA.cpp
+  VectorContractToPackedTypeDotProduct.cpp
 
   LINK_LIBS PUBLIC
   MLIRArithDialect
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp
new file mode 100644
index 0000000000000..f3af5ca167a35
--- /dev/null
+++ b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp
@@ -0,0 +1,143 @@
+//===- VectorContractToFMA.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/X86Vector/Transforms.h"
+#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::vector;
+using namespace mlir::x86vector;
+
+namespace {
+
+// Implements outer product contraction as a sequence of broadcast and
+// FMA operations.
+//
+// For example - for F32 type:
+// ```
+//   vector.contract <1x1xf32>, <1x16xf32> into <1x16xf32>
+// ```
+// to
+// ```
+//   vector.broadcast %lhs to <16xf32>
+//   vector.fma vector<16xf32>
+// ```
+struct VectorContractToFMA : public OpRewritePattern<vector::ContractionOp> {
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
+                                PatternRewriter &rewriter) const override {
+
+    if (contractOp.getKind() != vector::CombiningKind::ADD)
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Expects add combining kind.");
+
+    VectorType lhsTy = contractOp.getLhsType();
+    if (!lhsTy.getElementType().isF32())
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Only F32 lowering is supported.");
+
+    ArrayRef<int64_t> lhsShape = lhsTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimLhs;
+    llvm::copy_if(lhsShape, std::back_inserter(nonUnitDimLhs),
+                  [](int64_t dim) { return dim != 1; });
+
+    VectorType rhsTy = contractOp.getRhsType();
+    ArrayRef<int64_t> rhsShape = rhsTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimRhs;
+    llvm::copy_if(rhsShape, std::back_inserter(nonUnitDimRhs),
+                  [](int64_t dim) { return dim != 1; });
+
+    if (nonUnitDimLhs.size() > 0 && nonUnitDimRhs.size() > 0)
+      return rewriter.notifyMatchFailure(
+          contractOp, "Expects unit dimensions for either LHS or RHS shape.");
+
+    if (nonUnitDimLhs.size() != 1 && nonUnitDimRhs.size() != 1)
+      return rewriter.notifyMatchFailure(
+          contractOp,
+          "Expects one non-unit A/B dimension for either LHS or RHS shape.");
+
+    VectorType accTy = dyn_cast<VectorType>(contractOp.getAccType());
+    if (!accTy)
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Accumulator is not a vector type");
+
+    if (!accTy.getElementType().isF32())
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Accumulator should be F32 type.");
+
+    ArrayRef<int64_t> accShape = accTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimAcc;
+    llvm::copy_if(accShape, std::back_inserter(nonUnitDimAcc),
+                  [](int64_t dim) { return dim != 1; });
+    if (nonUnitDimAcc.size() != 1)
+      return rewriter.notifyMatchFailure(
+          contractOp, "A or B dimension should be non-unit.");
+
+    // Lowers vector.contract into a broadcast+FMA sequence.
+    auto loc = contractOp.getLoc();
+    auto castAcc = vector::ShapeCastOp::create(
+        rewriter, loc,
+        VectorType::get(nonUnitDimAcc.front(), accTy.getElementType()),
+        contractOp.getAcc());
+
+    vector::FMAOp fma;
+
+    // Broadcast the unit-dimension LHS or RHS to match the vector length of
+    // the corresponding non-unit dimension on the other operand. For example,
+    // if LHS has type vector<1x1xf32> and RHS has type vector<1x16xf32>, we
+    // broadcast the LHS to vector<16xf32>. In the opposite case (non-unit
+    // dimension on the LHS), we broadcast the RHS instead.
+    if (nonUnitDimRhs.size() > 0) {
+      auto castLhs = vector::ShapeCastOp::create(
+          rewriter, loc, VectorType::get(1, lhsTy.getElementType()),
+          contractOp.getLhs());
+      auto castRhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimRhs.front(), rhsTy.getElementType()),
+          contractOp.getRhs());
+      auto broadcastLhs = vector::BroadcastOp::create(
+          rewriter, loc, castRhs.getResult().getType(), castLhs);
+      fma =
+          vector::FMAOp::create(rewriter, loc, broadcastLhs, castRhs, castAcc);
+    } else {
+      auto castLhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimLhs.front(), lhsTy.getElementType()),
+          contractOp.getLhs());
+      auto castRhs = vector::ShapeCastOp::create(
+          rewriter, loc, VectorType::get(1, rhsTy.getElementType()),
+          contractOp.getRhs());
+      auto broadcastRhs = vector::BroadcastOp::create(
+          rewriter, loc, castLhs.getResult().getType(), castRhs);
+      fma =
+          vector::FMAOp::create(rewriter, loc, castLhs, broadcastRhs, castAcc);
+    }
+
+    auto castFma = vector::ShapeCastOp::create(rewriter, loc, accTy, fma);
+    rewriter.replaceOp(contractOp, castFma);
+
+    return success();
+  }
+};
+
+} // namespace
+
+void x86vector::populateVectorContractToFMAPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<VectorContractToFMA>(patterns.getContext());
+}
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp
new file mode 100644
index 0000000000000..1e64811db910b
--- /dev/null
+++ b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp
@@ -0,0 +1,301 @@
+//===- VectorContractToPackedTypeDotProduct.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/X86Vector/Transforms.h"
+#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::vector;
+using namespace mlir::x86vector;
+
+namespace {
+
+static FailureOr<SmallVector<mlir::utils::IteratorType>>
+inferIteratorsFromOutMap(AffineMap map) {
+  if (!map.isProjectedPermutation())
+    return failure();
+  SmallVector<mlir::utils::IteratorType> iterators(
+      map.getNumDims(), mlir::utils::IteratorType::reduction);
+  for (auto expr : map.getResults())
+    if (auto dim = dyn_cast<AffineDimExpr>(expr))
+      iterators[dim.getPosition()] = mlir::utils::IteratorType::parallel;
+  return iterators;
+}
+
+// Returns true if the operation is in VNNI layout.
+// Optionally, the check can be constrained to a specific VNNI blocking factor.
+static bool isInVnniLayout(Operation *op, ArrayRef<AffineMap> indexingMaps,
+                           std::optional<unsigned> blockingFactor) {
+  // Narrow down the operation type - VNNI only applies to contractions.
+  FailureOr<linalg::ContractionDimensions> dims =
+      linalg::inferContractionDims(indexingMaps);
+  if (failed(dims))
+    return false;
+
+  auto matA = op->getOperand(0);
+  auto matB = op->getOperand(1);
+  auto typeA = dyn_cast<VectorType>(matA.getType());
+  auto typeB = dyn_cast<VectorType>(matB.getType());
+  unsigned rankA = typeA.getRank();
+  unsigned rankB = typeB.getRank();
+  // VNNI format requires at least 1 parallel and 2 reduction dimensions.
+  if (rankA < 3 || rankB < 3)
+    return false;
+
+  // At least two reduction dimensions are expected:
+  // one for the VNNI factor and one for the K dimension.
+  if (dims->k.size() < 2)
+    return false;
+
+  // Validate affine maps - VNNI computation should be defined by the two
+  // innermost reduction iterators.
+  // The input matrix dimensions layout must match the following:
+  //   - matrix A - [...][K/vnniFactor][vnniFactor]
+  //   - matrix B - [...][K/vnniFactor][N][vnniFactor]
+  auto maybeIters = inferIteratorsFromOutMap(indexingMaps[2]);
+  if (failed(maybeIters))
+    return false;
+  SmallVector<mlir::utils::IteratorType> iteratorTypes = *maybeIters;
+  AffineMap mapA = indexingMaps[0];
+  AffineMap mapB = indexingMaps[1];
+
+  auto vnniDimA = dyn_cast<AffineDimExpr>(mapA.getResult(rankA - 1));
+  auto vnniDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 1));
+  if (!vnniDimA || !vnniDimB || vnniDimA != vnniDimB ||
+      iteratorTypes[vnniDimA.getPosition()] !=
+          mlir::utils::IteratorType::reduction)
+    return false;
+  auto redDimA = dyn_cast<AffineDimExpr>(mapA.getResult(rankA - 2));
+  auto redDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 3));
+  if (!redDimA || !redDimB || redDimA != redDimB ||
+      iteratorTypes[redDimA.getPosition()] !=
+          mlir::utils::IteratorType::reduction)
+    return false;
+  auto parallelDimB = dyn_cast<AffineDimExpr>(mapB.getResult(rankB - 2));
+  if (!parallelDimB || iteratorTypes[parallelDimB.getPosition()] !=
+                           mlir::utils::IteratorType::parallel)
+    return false;
+
+  // VNNI factor must be:
+  //  - the innermost inputs' dimension
+  //  - statically known
+  //  - multiple of 2 or equal to the specified factor
+  auto vnniDimSize = typeB.getShape().back();
+  if (vnniDimSize == ShapedType::kDynamic || vnniDimSize == 0 ||
+      vnniDimSize % 2 != 0)
+    return false;
+  if (typeA.getShape().back() != vnniDimSize)
+    return false;
+  if (blockingFactor && vnniDimSize != *blockingFactor)
+    return false;
+
+  // The split reduction dimension size should also match.
+  if (typeA.getShape().end()[-2] != typeB.getShape().end()[-3])
+    return false;
+
+  return true;
+}
+
+// Implements packed type outer product contraction as a sequence
+// of broadcast and packed dot-product operations.
+//
+// For example - for BF16 type:
+// ```
+//   vector.contract <1x1x2xbf16>, <1x16x2xbf16> into <1x16xf32>
+// ```
+// to
+// ```
+//   vector.broadcast %lhs to <32xbf16>
+//   x86vector.avx512.dot vector<32xbf16> -> vector<16xf32>
+// ```
+struct VectorContractToPackedTypeDotProduct
+    : public OpRewritePattern<vector::ContractionOp> {
+  using OpRewritePattern<vector::ContractionOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
+                                PatternRewriter &rewriter) const override {
+
+    if (contractOp.getKind() != vector::CombiningKind::ADD)
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Expects add combining kind.");
+
+    VectorType lhsTy = contractOp.getLhsType();
+    if (!lhsTy.getElementType().isBF16() &&
+        !lhsTy.getElementType().isSignlessInteger(8))
+      return rewriter.notifyMatchFailure(
+          contractOp, "Only BF16/Int8 lowering is supported.");
+
+    unsigned int blockingFactor =
+        lhsTy.getElementType().isBF16() ? 2 : 4;
+    if (!isInVnniLayout(contractOp.getOperation(),
+                        contractOp.getIndexingMapsArray(), blockingFactor))
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Input matrices not in VNNI format.");
+
+    ArrayRef<int64_t> lhsShape = lhsTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimLhs;
+    llvm::copy_if(lhsShape, std::back_inserter(nonUnitDimLhs),
+                  [](int64_t dim) { return dim != 1; });
+
+    VectorType rhsTy = contractOp.getRhsType();
+    ArrayRef<int64_t> rhsShape = rhsTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimRhs;
+    llvm::copy_if(rhsShape, std::back_inserter(nonUnitDimRhs),
+                  [](int64_t dim) { return dim != 1; });
+
+    if ((nonUnitDimLhs.size() - 1) > 0 && (nonUnitDimRhs.size() - 1) > 0)
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Expects unit dimensions for either "
+                                         "LHS or RHS shape other than VNNI.");
+
+    if ((nonUnitDimLhs.size() - 1) != 1 && (nonUnitDimRhs.size() - 1) != 1)
+      return rewriter.notifyMatchFailure(
+          contractOp,
+          "Expects one non-unit A/B dimension for either LHS or RHS shape.");
+
+    VectorType accTy = dyn_cast<VectorType>(contractOp.getAccType());
+    if (!accTy)
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Wrong accumulator type.");
+
+    if ((lhsTy.getElementType().isBF16() && !accTy.getElementType().isF32()) ||
+        (lhsTy.getElementType().isSignlessInteger(8) &&
+         !accTy.getElementType().isSignlessInteger(32)))
+      return rewriter.notifyMatchFailure(contractOp,
+                                         "Only F32 for BF16 or Int32 for Int8 "
+                                         "accumulation type is supported.");
+
+    ArrayRef<int64_t> accShape = accTy.getShape();
+    llvm::SmallVector<int64_t> nonUnitDimAcc;
+    llvm::copy_if(accShape, std::back_inserter(nonUnitDimAcc),
+                  [](int64_t dim) { return dim != 1; });
+    if (nonUnitDimAcc.size() != 1)
+      return rewriter.notifyMatchFailure(
+          contractOp, "A or B should be a non-unit dim in acc.");
+
+    // Non-unit dimensions should match the vector length of BF16 or Int8
+    // dot-product.
+    unsigned int nonUnitDim = nonUnitDimLhs.size() == 2 ? nonUnitDimLhs.front()
+                                                        : nonUnitDimRhs.front();
+    if (lhsTy.getElementType().isBF16() && nonUnitDim != 4 &&
+        nonUnitDim != 8 && nonUnitDim != 16 &&
+        nonUnitDimAcc.front() == nonUnitDim)
+      return rewriter.notifyMatchFailure(
+          contractOp, "BF16 dot-product operation expects non-unit (LHS or "
+                      "RHS) dim and acc dim of size 4/8/16.");
+
+    if (lhsTy.getElementType().isSignlessInteger(8) && nonUnitDim != 4 &&
+        nonUnitDim != 8 && nonUnitDimAcc.front() == nonUnitDim)
+      return rewriter.notifyMatchFailure(
+          contractOp, "Int8 dot-product operation expects non-unit (LHS or "
+                      "RHS) dim and acc dim of size 4/8.");
+
+    auto loc = contractOp.getLoc();
+    auto castAcc = vector::ShapeCastOp::create(
+        rewriter, loc,
+        VectorType::get(nonUnitDimAcc.front(), accTy.getElementType()),
+        contractOp.getAcc());
+
+    Value dp;
+
+    // Broadcast the unit-dimension LHS or RHS to match the vector length of
+    // the corresponding non-unit dimension on the other operand. For example,
+    // if LHS has type vector<1x1x2xbf16> and RHS has type vector<1x16x2xbf16>,
+    // we broadcast the LHS to vector<16x2xbf16>. In the opposite case
+    // (non-unit dimension on the LHS), we broadcast the RHS instead.
+    if ((nonUnitDimRhs.size() - 1) > 0) {
+      auto castRhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimRhs.front() * nonUnitDimRhs.back(),
+                          rhsTy.getElementType()),
+          contractOp.getRhs());
+      auto castLhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimLhs.front(), lhsTy.getElementType()),
+          contractOp.getLhs());
+      auto bitcastLhs = vector::BitCastOp::create(
+          rewriter, loc, VectorType::get({1}, rewriter.getIntegerType(32)),
+          castLhs);
+      auto broadcastLhs = vector::BroadcastOp::create(
+          rewriter, loc,
+          VectorType::get({nonUnitDimRhs.front()}, rewriter.getIntegerType(32)),
+          bitcastLhs);
+      auto bitcastLhsPkType = vector::BitCastOp::create(
+          rewriter, loc, castRhs.getResult().getType(), broadcastLhs);
+
+      if (lhsTy.getElementType().isBF16()) {
+        dp = x86vector::DotBF16Op::create(
+            rewriter, loc,
+            VectorType::get(nonUnitDimRhs.front(), rewriter.getF32Type()),
+            castAcc, bitcastLhsPkType, castRhs);
+      }
+
+      if (lhsTy.getElementType().isSignlessInteger(8)) {
+        dp = x86vector::DotInt8Op::create(
+            rewriter, loc,
+            VectorType::get(nonUnitDimRhs.front(), rewriter.getIntegerType(32)),
+            castAcc, bitcastLhsPkType, castRhs);
+      }
+    } else {
+      auto castLhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimLhs.front() * nonUnitDimLhs.back(),
+                          lhsTy.getElementType()),
+          contractOp.getLhs());
+      auto castRhs = vector::ShapeCastOp::create(
+          rewriter, loc,
+          VectorType::get(nonUnitDimRhs.front(), rhsTy.getElementType()),
+          contractOp.getRhs());
+      auto bitcastRhs = vector::BitCastOp::create(
+          rewriter, loc, VectorType::get({1}, rewriter.getIntegerType(32)),
+          castRhs);
+      auto broadcastRhs = vector::BroadcastOp::create(
+          rewriter, loc,
+          VectorType::get({nonUnitDimLhs.front()}, rewriter.getIntegerType(32)),
+          bitcastRhs);
+      auto bitcastRhsPkType = vector::BitCastOp::create(
+          rewriter, loc, castLhs.getResult().getType(), broadcastRhs);
+
+      if (lhsTy.getElementType().isBF16()) {
+        dp = x86vector::DotBF16Op::create(
+            rewriter, loc,
+            VectorType::get(nonUnitDimLhs.front(), rewriter.getF32Type()),
+            castAcc, castLhs, bitcastRhsPkType);
+      }
+
+      if (lhsTy.getElementType().isSignlessInteger(8)) {
+        dp = x86vector::DotInt8Op::create(
+            rewriter, loc,
+            VectorType::get(nonUnitDimLhs.front(), rewriter.getIntegerType(32)),
+            castAcc, castLhs, bitcastRhsPkType);
+      }
+    }
+
+    if (!dp)
+      return failure();
+
+    auto castDp = vector::ShapeCastOp::create(rewriter, loc, accTy, dp);
+    rewriter.replaceOp(contractOp, castDp);
+    return success();
+  }
+};
+
+} // namespace
+
+void x86vector::populateVectorContractToPackedTypeDotProductPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<VectorContractToPackedTypeDotProduct>(patterns.getContext());
}
diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp
index c857c38df717c..4312100a0c0b0 100644
--- a/mlir/lib/RegisterAllExtensions.cpp
+++ b/mlir/lib/RegisterAllExtensions.cpp
@@ -56,6 +56,7 @@
 #include "mlir/Dialect/Transform/SMTExtension/SMTExtension.h"
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h"
 #include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h"
+#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h"
 #include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h"
 #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
@@ -113,6 +114,7 @@ void mlir::registerAllExtensions(DialectRegistry &registry) {
   transform::registerSMTExtension(registry);
   transform::registerTuneExtension(registry);
   vector::registerTransformDialectExtension(registry);
+  x86vector::registerTransformDialectExtension(registry);
   xegpu::registerTransformDialectExtension(registry);
   arm_neon::registerTransformDialectExtension(registry);
   arm_sve::registerTransformDialectExtension(registry);
diff --git a/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir b/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir
new file mode 100644
index 0000000000000..e506b166d43ff
--- /dev/null
+++ b/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir
@@ -0,0 +1,344 @@
+// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s
+
+!vecA = vector<1x1xf32>
+!vecB = vector<1x64xf32>
+!vecC = vector<1x64xf32>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @matmul_outer_product_to_fma(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @matmul_outer_product_to_fma
+// CHECK: vector.broadcast{{.*}}vector<1xf32> to vector<64xf32>
+// CHECK: vector.fma{{.*}}vector<64xf32>
+// CHECK: vector.shape_cast{{.*}}vector<64xf32> to vector<1x64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<64x1xf32>
+!vecB = vector<1x1xf32>
+!vecC = vector<64x1xf32>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @matmul_outer_product_to_fma_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @matmul_outer_product_to_fma_bcst_B
+// CHECK: vector.broadcast
+// CHECK: vector.fma{{.*}}vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1xf32>
+!vecB = vector<1x1x64xf32>
+!vecC = vector<1x1x64xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @batch_matmul_to_fma(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @batch_matmul_to_fma
+// CHECK: vector.broadcast
+// CHECK: vector.fma{{.*}}vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x64x1xf32>
+!vecB = vector<1x1x1xf32>
+!vecC = vector<1x64x1xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @batch_matmul_to_fma_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @batch_matmul_to_fma_bcst_B
+// CHECK: vector.broadcast
+// CHECK: vector.fma{{.*}}vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1xf32>
+!vecB = vector<1x1x64xf32>
+!vecC = vector<1x64xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+func.func @brgemm_to_fma(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @brgemm_to_fma
+// CHECK: vector.broadcast
+// CHECK: vector.fma{{.*}}vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x64x1xf32>
+!vecB = vector<1x1x1xf32>
+!vecC = vector<64x1xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+func.func @brgemm_to_fma_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @brgemm_to_fma_bcst_B
+// CHECK: vector.broadcast
+// CHECK: vector.fma{{.*}}vector<64xf32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<3x1x1xf32>
+!vecB = vector<3x1x64xf32>
+!vecC = vector<3x1x64xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_non_unit_batch_dim(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// Batch dimension should've been simplified earlier.
+
+// CHECK-LABEL: @negative_non_unit_batch_dim
+// CHECK-NOT: vector.fma
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<3x1x1xf32>
+!vecB = vector<3x1x64xf32>
+!vecC = vector<1x64xf32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)>
+func.func @negative_non_unit_batch_reduce_dim(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// Batch-reduce dimension should've been simplified earlier.
+
+// CHECK-LABEL: @negative_non_unit_batch_reduce_dim
+// CHECK-NOT: vector.fma
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1xf32>
+!vecB = vector<1x64xf32>
+!vecC = vector<1x64xf32>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @negative_invalid_kind(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "reduction"],
+    kind = #vector.kind<mul>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_invalid_kind
+// CHECK-NOT: vector.fma
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1xf32>
+!vecB = vector<1x1x64xf32>
+!vecC = vector<1x1x64xi32>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_accumulator_type(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_accumulator_type
+// CHECK-NOT: vector.fma
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_fma
+    } : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir b/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir
new file mode 100644
index 0000000000000..65676cbae772c
--- /dev/null
+++ b/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir
@@ -0,0 +1,681 @@
+// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s
+
+!vecA = vector<1x1x1x2xbf16>
+!vecB = vector<1x1x16x2xbf16>
+!vecC = vector<1x16xf32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @brgemm_to_bf16dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @brgemm_to_bf16dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx512.dot
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x16x1x2xbf16>
+!vecB = vector<1x1x1x2xbf16>
+!vecC = vector<16x1xf32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @brgemm_to_bf16dp_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @brgemm_to_bf16dp_bcst_B
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx512.dot
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x4xi8>
+!vecB = vector<1x1x8x4xi8>
+!vecC = vector<1x8xi32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @brgemm_to_int8dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @brgemm_to_int8dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx.dot.i8
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x2xbf16>
+!vecB = vector<1x1x16x2xbf16>
+!vecC = vector<1x1x16xf32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @batch_matmul_bf16dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @batch_matmul_bf16dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx512.dot
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x4xi8>
+!vecB = vector<1x1x8x4xi8>
+!vecC = vector<1x1x8xi32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @batch_matmul_int8dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @batch_matmul_int8dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx.dot.i8
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x8x1x4xi8>
+!vecB = vector<1x1x1x4xi8>
+!vecC = vector<1x8x1xi32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @batch_matmul_int8dp_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @batch_matmul_int8dp_bcst_B
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx.dot.i8
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x2xbf16>
+!vecB = vector<1x16x2xbf16>
+!vecC = vector<1x16xf32>
+#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)>
+#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)>
+#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)>
+func.func @matmul_outer_product_to_bf16dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @matmul_outer_product_to_bf16dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx512.dot
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<16x1x2xbf16>
+!vecB = vector<1x1x2xbf16>
+!vecC = vector<16x1xf32>
+#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)>
+#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)>
+#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)>
+func.func @matmul_outer_product_to_bf16dp_bcst_B(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @matmul_outer_product_to_bf16dp_bcst_B
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx512.dot
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x4xi8>
+!vecB = vector<1x8x4xi8>
+!vecC = vector<1x8xi32>
+#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)>
+#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)>
+#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)>
+func.func @matmul_outer_product_to_int8dp(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @matmul_outer_product_to_int8dp
+// CHECK: vector.broadcast
+// CHECK: x86vector.avx.dot.i8
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x2xbf16>
+!vecB = vector<1x16x2xbf16>
+!vecC = vector<1x16xf32>
+#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)>
+#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)>
+#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)>
+func.func @negative_invalid_vc_kind(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<mul>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_invalid_vc_kind
+// CHECK-NOT: x86vector.avx512.dot
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x4xbf16>
+!vecB = vector<1x1x16x4xbf16>
+!vecC = vector<1x16xf32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @negative_false_vnni_bf16(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_false_vnni_bf16
+// CHECK-NOT: x86vector.avx512.dot
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x2xi8>
+!vecB = vector<1x1x8x2xi8>
+!vecC = vector<1x8xi32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @negative_false_vnni_int8(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_false_vnni_int8
+// CHECK-NOT: x86vector.avx.dot.i8
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<3x1x1x2xbf16>
+!vecB = vector<3x1x16x2xbf16>
+!vecC = vector<3x1x16xf32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_batch_dimension(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_batch_dimension
+// CHECK-NOT: x86vector.avx512.dot
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<2x1x1x4xi8>
+!vecB = vector<2x1x8x4xi8>
+!vecC = vector<1x8xi32>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)>
+func.func @negative_brgemm_dimension(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_brgemm_dimension
+// CHECK-NOT: x86vector.avx.dot.i8
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x2xbf16>
+!vecB = vector<1x1x16x2xbf16>
+!vecC = vector<1x1x16xbf16>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_float_acc_type(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_float_acc_type
+// CHECK-NOT: x86vector.avx512.dot
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+!vecA = vector<1x1x1x4xi8>
+!vecB = vector<1x1x8x4xi8>
+!vecC = vector<1x1x8xi8>
+#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)>
+#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)>
+#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_int_acc_type(
+    %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC
+{
+  %0 = vector.contract {
+    indexing_maps = [#map, #map1, #map2],
+    iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"],
+    kind = #vector.kind<add>}
+    %arg0, %arg1, %arg2
+    : !vecA, !vecB into !vecC
+  return %0 : !vecC
+}
+
+// CHECK-LABEL: @negative_int_acc_type
+// CHECK-NOT: x86vector.avx.dot.i8
+// CHECK: vector.contract
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product
+    } : !transform.any_op
transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xbf16> +!vecB = vector<1x1x16x4xbf16> +!vecC = vector<1x1x16xbf16> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vnni_blocking_factor_bf16( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vnni_blocking_factor_bf16 +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1xbf16> +!vecB = vector<1x1x32xbf16> +!vecC = vector<1x32xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +func.func @negative_brgemm_not_vnni( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_brgemm_not_vnni +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xi8> +!vecB = vector<1x1x16x4xi8> +!vecC = vector<1x1x16xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vector_shape_int8( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vector_shape_int8 +// CHECK-NOT: x86vector.avx.dot.i8 +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x2xbf16> +!vecB = 
vector<1x1x32x2xbf16> +!vecC = vector<1x1x32xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vector_shape_bf16( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vector_shape_bf16 +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} From 76e7e9fa109e424c18edc2b89e991dac99979599 Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Mon, 24 Nov 2025 13:55:00 +0800 Subject: [PATCH 03/38] [LoongArch][NFC] Add tests for combining vand(vnot) (#160830) --- .../CodeGen/LoongArch/lasx/and-not-combine.ll | 426 +++++++++++++++++- .../CodeGen/LoongArch/lsx/and-not-combine.ll | 349 +++++++++++++- 2 files changed, 771 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll index 67549599db2f3..aa67a20ab08a7 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @and_not_combine_v32i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ; CHECK-LABEL: and_not_combine_v32i8: @@ -85,3 +85,425 @@ entry: store <4 x i64> %and, ptr %res ret void } + +define void @pre_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.b $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a + %b.not = xor i8 %b, -1 + %b.not.ele = insertelement <32 x i8> poison, i8 %b.not, i64 0 + %v1.not = shufflevector <32 x i8> %b.not.ele, <32 x i8> poison, <32 x i32> zeroinitializer + %v0.not = xor <32 x i8> %v0, splat (i8 -1) + %and = and <32 x i8> %v0.not, %v1.not + store <32 x i8> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.b $xr1, $a2 +; CHECK-NEXT: xvxori.b $xr1, $xr1, 255 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a + %b.ele = insertelement <32 x 
i8> poison, i8 %b, i64 0 + %v1 = shufflevector <32 x i8> %b.ele, <32 x i8> poison, <32 x i32> zeroinitializer + %v0.not = xor <32 x i8> %v0, splat (i8 -1) + %v1.not = xor <32 x i8> %v1, splat (i8 -1) + %and = and <32 x i8> %v0.not, %v1.not + store <32 x i8> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.h $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a + %b.not = xor i16 %b, -1 + %b.not.ele = insertelement <16 x i16> poison, i16 %b.not, i64 0 + %v1.not = shufflevector <16 x i16> %b.not.ele, <16 x i16> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i16> %v0, splat (i16 -1) + %and = and <16 x i16> %v0.not, %v1.not + store <16 x i16> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.h $xr1, $a2 +; CHECK-NEXT: xvrepli.b $xr2, -1 +; CHECK-NEXT: xvxor.v $xr1, $xr1, $xr2 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a + %b.ele = insertelement <16 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <16 x i16> %b.ele, <16 x i16> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i16> %v0, splat (i16 -1) + %v1.not = xor <16 x i16> %v1, splat (i16 -1) + %and = and <16 x i16> %v0.not, %v1.not + store <16 x i16> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.w $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a + %b.not = xor i32 %b, -1 + %b.not.ele = insertelement <8 x i32> poison, i32 %b.not, i64 0 + %v1.not = shufflevector <8 x i32> %b.not.ele, <8 x i32> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i32> %v0, splat (i32 -1) + %and = and <8 x i32> %v0.not, %v1.not + store <8 x i32> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.w $xr1, $a2 +; CHECK-NEXT: xvrepli.b $xr2, -1 +; CHECK-NEXT: xvxor.v $xr1, $xr1, $xr2 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a + %b.ele = insertelement <8 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <8 x i32> %b.ele, <8 x i32> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i32> %v0, splat (i32 -1) + %v1.not = xor <8 x i32> %v1, splat (i32 -1) + %and = and <8 x i32> %v0.not, %v1.not + store <8 x i32> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: pre_not_and_not_combine_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: nor $a1, $a3, $zero +; LA32-NEXT: nor $a2, $a2, $zero +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: xvreplve0.d $xr1, $xr1 +; 
LA32-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: pre_not_and_not_combine_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: nor $a1, $a2, $zero +; LA64-NEXT: xvreplgr2vr.d $xr1, $a1 +; LA64-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <4 x i64>, ptr %a + %b.not = xor i64 %b, -1 + %b.not.ele = insertelement <4 x i64> poison, i64 %b.not, i64 0 + %v1.not = shufflevector <4 x i64> %b.not.ele, <4 x i64> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i64> %v0, splat (i64 -1) + %and = and <4 x i64> %v0.not, %v1.not + store <4 x i64> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: post_not_and_not_combine_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: xvreplve0.d $xr1, $xr1 +; LA32-NEXT: xvrepli.b $xr2, -1 +; LA32-NEXT: xvxor.v $xr1, $xr1, $xr2 +; LA32-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: post_not_and_not_combine_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvreplgr2vr.d $xr1, $a2 +; LA64-NEXT: xvrepli.b $xr2, -1 +; LA64-NEXT: xvxor.v $xr1, $xr1, $xr2 +; LA64-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <4 x i64>, ptr %a + %b.ele = insertelement <4 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <4 x i64> %b.ele, <4 x i64> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i64> %v0, splat (i64 -1) + %v1.not = xor <4 x i64> %v1, splat (i64 -1) + %and = and <4 x i64> %v0.not, %v1.not + store <4 x i64> %and, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v32i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.b $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a0 + %and = and <32 x i8> %v0, splat (i8 -4) + %xor = xor <32 x i8> %and, splat (i8 -4) + store <32 x i8> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v16i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.h $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a0 + %and = and <16 x i16> %v0, splat (i16 -4) + %xor = xor <16 x i16> %and, splat (i16 -4) + store <16 x i16> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v8i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.w $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a0 + %and = and <8 x i32> %v0, splat (i32 -4) + %xor = xor <8 x i32> %and, splat (i32 -4) + store <8 x i32> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v4i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.d $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, ptr %a0 
+ %and = and <4 x i64> %v0, splat (i64 -4) + %xor = xor <4 x i64> %and, splat (i64 -4) + store <4 x i64> %xor, ptr %res + ret void +} + +define void @and_or_not_combine_v32i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.b $xr0, $xr1, $xr0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvseq.b $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandi.b $xr0, $xr0, 4 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <32 x i8>, ptr %pa + %b = load <32 x i8>, ptr %pb + %v = load <32 x i8>, ptr %pv + %ca = icmp ne <32 x i8> %v, %a + %cb = icmp ne <32 x i8> %v, %b + %or = or <32 x i1> %ca, %cb + %ext = sext <32 x i1> %or to <32 x i8> + %and = and <32 x i8> %ext, splat (i8 4) + store <32 x i8> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v16i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.h $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.h $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.h $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <16 x i16>, ptr %pa + %b = load <16 x i16>, ptr %pb + %v = load <16 x i16>, ptr %pv + %ca = icmp ne <16 x i16> %v, %a + %cb = icmp ne <16 x i16> %v, %b + %or = or <16 x i1> %ca, %cb + %ext = sext <16 x i1> %or to <16 x i16> + %and = and <16 x i16> %ext, splat (i16 4) + store <16 x i16> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v8i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.w $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.w $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.w $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <8 x i32>, ptr %pa + %b = load <8 x i32>, ptr %pb + %v = load <8 x i32>, ptr %pv + %ca = icmp ne <8 x i32> %v, %a + %cb = icmp ne <8 x i32> %v, %b + %or = or <8 x i1> %ca, %cb + %ext = sext <8 x i1> %or to <8 x i32> + %and = and <8 x i32> %ext, splat (i32 4) + store <8 x i32> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v4i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.d $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <4 x i64>, ptr %pa + %b = load <4 x i64>, ptr %pb + %v = load <4 x i64>, ptr %pv + %ca = icmp ne <4 x i64> %v, %a + %cb = icmp ne <4 x i64> %v, %b + %or = or <4 x i1> %ca, %cb + %ext = sext <4 x i1> %or to <4 x i64> + %and = and <4 x i64> 
%ext, splat (i64 4) + store <4 x i64> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v32i8(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vandi.b $vr0, $vr0, 4 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <32 x i8>, ptr %pa + %a.not = xor <32 x i8> %a, splat (i8 -1) + %subv = shufflevector <32 x i8> %a.not, <32 x i8> poison, + <16 x i32> + %and = and <16 x i8> %subv, splat (i8 4) + store <16 x i8> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v16i16(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.h $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <16 x i16>, ptr %pa + %a.not = xor <16 x i16> %a, splat (i16 -1) + %subv = shufflevector <16 x i16> %a.not, <16 x i16> poison, + <8 x i32> + %and = and <8 x i16> %subv, splat (i16 4) + store <8 x i16> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v8i32(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.w $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <8 x i32>, ptr %pa + %a.not = xor <8 x i32> %a, splat (i32 -1) + %subv = shufflevector <8 x i32> %a.not, <8 x i32> poison, <4 x i32> + %and = and <4 x i32> %subv, splat (i32 4) + store <4 x i32> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v4i64(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.d $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <4 x i64>, ptr %pa + %a.not = xor <4 x i64> %a, splat (i64 -1) + %subv = shufflevector <4 x i64> %a.not, <4 x i64> poison, <2 x i32> + %and = and <2 x i64> %subv, splat (i64 4) + store <2 x i64> %and, ptr %dst + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll index 3c6d34505e114..960d8c4b156b5 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @and_not_combine_v16i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ; CHECK-LABEL: and_not_combine_v16i8: @@ 
-85,3 +85,348 @@ entry: store <2 x i64> %and, ptr %res ret void } + +define void @pre_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.b $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a + %b.not = xor i8 %b, -1 + %b.not.ele = insertelement <16 x i8> poison, i8 %b.not, i64 0 + %v1.not = shufflevector <16 x i8> %b.not.ele, <16 x i8> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i8> %v0, splat (i8 -1) + %and = and <16 x i8> %v0.not, %v1.not + store <16 x i8> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.b $vr1, $a2 +; CHECK-NEXT: vxori.b $vr1, $vr1, 255 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a + %b.ele = insertelement <16 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <16 x i8> %b.ele, <16 x i8> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i8> %v0, splat (i8 -1) + %v1.not = xor <16 x i8> %v1, splat (i8 -1) + %and = and <16 x i8> %v0.not, %v1.not + store <16 x i8> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.h $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a + %b.not = xor i16 %b, -1 + %b.not.ele = insertelement <8 x i16> poison, i16 %b.not, i64 0 + %v1.not = shufflevector <8 x i16> %b.not.ele, <8 x i16> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i16> %v0, splat (i16 -1) + %and = and <8 x i16> %v0.not, %v1.not + store <8 x i16> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.h $vr1, $a2 +; CHECK-NEXT: vrepli.b $vr2, -1 +; CHECK-NEXT: vxor.v $vr1, $vr1, $vr2 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a + %b.ele = insertelement <8 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <8 x i16> %b.ele, <8 x i16> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i16> %v0, splat (i16 -1) + %v1.not = xor <8 x i16> %v1, splat (i16 -1) + %and = and <8 x i16> %v0.not, %v1.not + store <8 x i16> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.w $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a + %b.not = xor i32 %b, -1 + %b.not.ele = insertelement <4 x i32> poison, i32 %b.not, i64 0 + %v1.not = shufflevector <4 x i32> %b.not.ele, <4 x i32> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i32> %v0, splat (i32 -1) + %and = and <4 x i32> %v0.not, %v1.not + store <4 x i32> %and, ptr 
%res + ret void +} + +define void @post_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.w $vr1, $a2 +; CHECK-NEXT: vrepli.b $vr2, -1 +; CHECK-NEXT: vxor.v $vr1, $vr1, $vr2 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a + %b.ele = insertelement <4 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <4 x i32> %b.ele, <4 x i32> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i32> %v0, splat (i32 -1) + %v1.not = xor <4 x i32> %v1, splat (i32 -1) + %and = and <4 x i32> %v0.not, %v1.not + store <4 x i32> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: pre_not_and_not_combine_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: nor $a1, $a3, $zero +; LA32-NEXT: nor $a2, $a2, $zero +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: vreplvei.d $vr1, $vr1, 0 +; LA32-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: pre_not_and_not_combine_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: nor $a1, $a2, $zero +; LA64-NEXT: vreplgr2vr.d $vr1, $a1 +; LA64-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <2 x i64>, ptr %a + %b.not = xor i64 %b, -1 + %b.not.ele = insertelement <2 x i64> poison, i64 %b.not, i64 0 + %v1.not = shufflevector <2 x i64> %b.not.ele, <2 x i64> poison, <2 x i32> zeroinitializer + %v0.not = xor <2 x i64> %v0, splat (i64 -1) + %and = and <2 x i64> %v0.not, %v1.not + store <2 x i64> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: post_not_and_not_combine_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: vreplvei.d $vr1, $vr1, 0 +; LA32-NEXT: vrepli.b $vr2, -1 +; LA32-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: post_not_and_not_combine_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vreplgr2vr.d $vr1, $a2 +; LA64-NEXT: vrepli.b $vr2, -1 +; LA64-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <2 x i64>, ptr %a + %b.ele = insertelement <2 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <2 x i64> %b.ele, <2 x i64> poison, <2 x i32> zeroinitializer + %v0.not = xor <2 x i64> %v0, splat (i64 -1) + %v1.not = xor <2 x i64> %v1, splat (i64 -1) + %and = and <2 x i64> %v0.not, %v1.not + store <2 x i64> %and, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v16i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.b $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a0 + %and = and <16 x i8> %v0, splat (i8 -4) + %xor = xor <16 x i8> %and, splat (i8 -4) + store <16 x i8> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v8i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; 
CHECK-NEXT: vrepli.h $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a0 + %and = and <8 x i16> %v0, splat (i16 -4) + %xor = xor <8 x i16> %and, splat (i16 -4) + store <8 x i16> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v4i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.w $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a0 + %and = and <4 x i32> %v0, splat (i32 -4) + %xor = xor <4 x i32> %and, splat (i32 -4) + store <4 x i32> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v2i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.d $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <2 x i64>, ptr %a0 + %and = and <2 x i64> %v0, splat (i64 -4) + %xor = xor <2 x i64> %and, splat (i64 -4) + store <2 x i64> %xor, ptr %res + ret void +} + +define void @and_or_not_combine_v16i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.b $vr0, $vr1, $vr0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vseq.b $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vandi.b $vr0, $vr0, 4 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <16 x i8>, ptr %pa + %b = load <16 x i8>, ptr %pb + %v = load <16 x i8>, ptr %pv + %ca = icmp ne <16 x i8> %v, %a + %cb = icmp ne <16 x i8> %v, %b + %or = or <16 x i1> %ca, %cb + %ext = sext <16 x i1> %or to <16 x i8> + %and = and <16 x i8> %ext, splat (i8 4) + store <16 x i8> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v8i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.h $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.h $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.h $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <8 x i16>, ptr %pa + %b = load <8 x i16>, ptr %pb + %v = load <8 x i16>, ptr %pv + %ca = icmp ne <8 x i16> %v, %a + %cb = icmp ne <8 x i16> %v, %b + %or = or <8 x i1> %ca, %cb + %ext = sext <8 x i1> %or to <8 x i16> + %and = and <8 x i16> %ext, splat (i16 4) + store <8 x i16> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v4i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.w $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.w $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.w $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <4 x i32>, ptr %pa + %b = load <4 x i32>, ptr %pb + %v = load <4 x i32>, ptr %pv + 
%ca = icmp ne <4 x i32> %v, %a + %cb = icmp ne <4 x i32> %v, %b + %or = or <4 x i1> %ca, %cb + %ext = sext <4 x i1> %or to <4 x i32> + %and = and <4 x i32> %ext, splat (i32 4) + store <4 x i32> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v2i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.d $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %pa + %b = load <2 x i64>, ptr %pb + %v = load <2 x i64>, ptr %pv + %ca = icmp ne <2 x i64> %v, %a + %cb = icmp ne <2 x i64> %v, %b + %or = or <2 x i1> %ca, %cb + %ext = sext <2 x i1> %or to <2 x i64> + %and = and <2 x i64> %ext, splat (i64 4) + store <2 x i64> %and, ptr %dst + ret void +} From d124675e27a6abbce0bfea6a25ab9dfe66e9d657 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Mon, 24 Nov 2025 07:09:39 +0100 Subject: [PATCH 04/38] [mlir][x86vector] Add missing Linalg dependency (#169280) Adds required dependency for `inferContractionDims`. Fixes #168074 --- mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt index 3d2288049e49e..2cab50fb591c4 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt @@ -8,6 +8,7 @@ add_mlir_dialect_library(MLIRX86VectorTransforms MLIRArithDialect MLIRX86VectorDialect MLIRIR + MLIRLinalgDialect MLIRLLVMCommonConversion MLIRLLVMDialect MLIRVectorDialect From 54db657b9ebdbce70f902313e6b303d85d68a4dc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 23 Nov 2025 22:32:43 -0800 Subject: [PATCH 05/38] [StaticAnalyzer] Use llvm::find_if (NFC) (#169237) Identified with llvm-use-ranges. --- clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp index 6673f2f319c0e..aafd8d45537e3 100644 --- a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp @@ -202,11 +202,10 @@ SarifDiagnostics::createResult(const PathDiagnostic *Diag, // Find the HTML report that was generated for this issue, if one exists. PDFileEntry::ConsumerFiles *Files = FM->getFiles(*Diag); if (Files) { - auto HtmlFile = - std::find_if(Files->cbegin(), Files->cend(), [](auto &File) { - return File.first == HTML_DIAGNOSTICS_NAME; - }); - if (HtmlFile != Files->cend()) { + auto HtmlFile = llvm::find_if(*Files, [](const auto &File) { + return File.first == HTML_DIAGNOSTICS_NAME; + }); + if (HtmlFile != Files->end()) { SmallString<128> HtmlReportPath = llvm::sys::path::parent_path(OutputFile); llvm::sys::path::append(HtmlReportPath, HtmlFile->second); From 67391fc039b27f4e82624a6de4493cdd0907878b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 23 Nov 2025 22:32:50 -0800 Subject: [PATCH 06/38] [mlir] Construct SmallVector with initial values (NFC) (#169239) Identified with llvm-use-ranges. 
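For context, a minimal standalone sketch of the pattern this rewrite applies (the identifiers and the sentinel constant below are illustrative stand-ins, not taken from the patch; `kDynamicSentinel` plays the role of `ShapedType::kDynamic`):

```c++
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <algorithm>
#include <cstdint>

// Hypothetical sentinel standing in for ShapedType::kDynamic.
constexpr int64_t kDynamicSentinel = -1;

// Old form: copy the source values, then overwrite every element.
llvm::SmallVector<int64_t> fillBefore(llvm::ArrayRef<int64_t> offsets) {
  llvm::SmallVector<int64_t> v(offsets.begin(), offsets.end());
  std::fill(v.begin(), v.end(), kDynamicSentinel);
  return v;
}

// New form: the (size, value) constructor yields the same result directly.
llvm::SmallVector<int64_t> fillAfter(llvm::ArrayRef<int64_t> offsets) {
  return llvm::SmallVector<int64_t>(offsets.size(), kDynamicSentinel);
}
```

The copied-then-overwritten elements in the old form are wasted work, which is what the check flags.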
--- .../XeGPU/Transforms/XeGPUSubgroupDistribute.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 4455811a2e681..b64eb5b29ccb0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -989,9 +989,8 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector newOperands = llvm::map_to_vector( newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); - SmallVector newConstOffsets{matrixOp.getConstOffsets()}; - std::fill(newConstOffsets.begin(), newConstOffsets.end(), - ShapedType::kDynamic); + SmallVector newConstOffsets(matrixOp.getConstOffsets().size(), + ShapedType::kDynamic); DenseI64ArrayAttr newConstOffsetsAttr = rewriter.getDenseI64ArrayAttr(newConstOffsets); ValueRange currentOffsets = @@ -1066,9 +1065,8 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector newOperands = llvm::map_to_vector( newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); - SmallVector newConstOffsets{matrixOp.getConstOffsets()}; - std::fill(newConstOffsets.begin(), newConstOffsets.end(), - ShapedType::kDynamic); + SmallVector newConstOffsets(matrixOp.getConstOffsets().size(), + ShapedType::kDynamic); DenseI64ArrayAttr newConstOffsetsAttr = rewriter.getDenseI64ArrayAttr(newConstOffsets); ValueRange currentOffsets = From 2b81e9e8fea0cdb2eac1537c1f882b695615b141 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 23 Nov 2025 22:32:58 -0800 Subject: [PATCH 07/38] [Orc] Use a range-based for loop (NFC) (#169240) Identified with modernize-loop-convert. --- llvm/lib/ExecutionEngine/Orc/Core.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index d029ac587fb9a..9233c8ed8d0dd 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -713,12 +713,7 @@ JITDylib::defineMaterializing(MaterializationResponsibility &FromMR, std::vector AddedSyms; std::vector RejectedWeakDefs; - for (auto SFItr = SymbolFlags.begin(), SFEnd = SymbolFlags.end(); - SFItr != SFEnd; ++SFItr) { - - auto &Name = SFItr->first; - auto &Flags = SFItr->second; - + for (auto &[Name, Flags] : SymbolFlags) { auto EntryItr = Symbols.find(Name); // If the entry already exists... From 7dd531f428614a310b6715fe9181432393d9095b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 23 Nov 2025 22:33:06 -0800 Subject: [PATCH 08/38] [SPIRV] Use range-based for loops (NFC) (#169241) Identified with modernize-loop-convert. 
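As a reminder of what modernize-loop-convert rewrites here, a self-contained sketch (the `Module`/`Function` stand-ins below are simplified placeholders, not the real LLVM types; only the loop shape matters):

```c++
#include <vector>

struct Function {
  bool isDeclaration() const { return false; }
};

struct Module {
  std::vector<Function> Functions;
  auto begin() const { return Functions.begin(); }
  auto end() const { return Functions.end(); }
};

// Old form: explicit iterators plus a dereference at every use.
void visitBefore(const Module &M) {
  for (auto F = M.begin(), E = M.end(); F != E; ++F) {
    if (F->isDeclaration())
      continue;
    // ... work on *F ...
  }
}

// New form: the range-based loop binds a reference directly.
void visitAfter(const Module &M) {
  for (const Function &F : M) {
    if (F.isDeclaration())
      continue;
    // ... work on F ...
  }
}
```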
--- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index bd754d17694b8..00f750b88a608 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -494,8 +494,8 @@ MCRegister SPIRVModuleAnalysis::handleVariable( void SPIRVModuleAnalysis::collectDeclarations(const Module &M) { InstrGRegsMap SignatureToGReg; std::map GlobalToGReg; - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; const MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -634,10 +634,10 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, // be correctly collected until these registers are globally numbered. void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { InstrTraces IS; - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) + for (const Function &F : M) { + if (F.isDeclaration()) continue; - MachineFunction *MF = MMI->getMachineFunction(*F); + MachineFunction *MF = MMI->getMachineFunction(F); assert(MF); for (MachineBasicBlock &MBB : *MF) @@ -669,13 +669,13 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { collectOtherInstr(MI, MAI, SPIRV::MB_AliasingInsts, IS); } else if (TII->isDecorationInstr(MI)) { collectOtherInstr(MI, MAI, SPIRV::MB_Annotations, IS); - collectFuncNames(MI, &*F); + collectFuncNames(MI, &F); } else if (TII->isConstantInstr(MI)) { // Now OpSpecConstant*s are not in DT, // but they need to be collected anyway. collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS); } else if (OpCode == SPIRV::OpFunction) { - collectFuncNames(MI, &*F); + collectFuncNames(MI, &F); } else if (OpCode == SPIRV::OpTypeForwardPointer) { collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS, false); } @@ -687,10 +687,10 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { // the result in global register alias table. Some registers are already // numbered. void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - if ((*F).isDeclaration()) + for (const Function &F : M) { + if (F.isDeclaration()) continue; - MachineFunction *MF = MMI->getMachineFunction(*F); + MachineFunction *MF = MMI->getMachineFunction(F); assert(MF); for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { @@ -2169,8 +2169,8 @@ void addInstrRequirements(const MachineInstr &MI, static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MachineModuleInfo *MMI, const SPIRVSubtarget &ST) { // Collect requirements for existing instructions. 
- for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (const MachineBasicBlock &MBB : *MF) @@ -2250,8 +2250,7 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, if (RequireKHRFloatControls2) MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); } - for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { - const Function &F = *FI; + for (const Function &F : M) { if (F.isDeclaration()) continue; if (F.getMetadata("reqd_work_group_size")) @@ -2431,23 +2430,23 @@ static void addDecorations(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, SPIRV::ModuleAnalysisInfo &MAI, const SPIRVGlobalRegistry *GR) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (auto &MBB : *MF) for (auto &MI : MBB) handleMIFlagDecoration(MI, ST, TII, MAI.Reqs, GR, - MAI.FPFastMathDefaultInfoMap[&(*F)]); + MAI.FPFastMathDefaultInfoMap[&F]); } } static void addMBBNames(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, SPIRV::ModuleAnalysisInfo &MAI) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -2467,8 +2466,8 @@ static void addMBBNames(const Module &M, const SPIRVInstrInfo &TII, // patching Instruction::PHI to SPIRV::OpPhi static void patchPhis(const Module &M, SPIRVGlobalRegistry *GR, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (auto &MBB : *MF) { From 9ce6fadbcaf60ed88302617b6301f68989d44e3e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 23 Nov 2025 22:33:13 -0800 Subject: [PATCH 09/38] [AST] Construct iterator_range with the conversion constructor (NFC) (#169245) This patch simplifies iterator_range construction with the conversion constructor. 
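The mechanism behind the simplification: in recent LLVM, `llvm::iterator_range` has an implicit conversion constructor that accepts anything exposing `begin()`/`end()`, so a returned `ArrayRef` converts on its own. A minimal sketch under that assumption (the `Expr` stand-in and accessor are illustrative, not the actual AST types):

```c++
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/iterator_range.h"

struct Expr {};

using helper_expr_const_range =
    llvm::iterator_range<llvm::ArrayRef<Expr *>::iterator>;

llvm::ArrayRef<Expr *> getHelperExprs() { return {}; }

// Old form: spell out both iterators by hand.
helper_expr_const_range exprsBefore() {
  return helper_expr_const_range(getHelperExprs().begin(),
                                 getHelperExprs().end());
}

// New form: the conversion constructor picks up begin()/end() itself.
helper_expr_const_range exprsAfter() { return getHelperExprs(); }
```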
--- clang/include/clang/AST/OpenMPClause.h | 328 ++++++------------------- 1 file changed, 79 insertions(+), 249 deletions(-) diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 7972d05bedbf7..72c5efde7449b 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -313,12 +313,8 @@ template class OMPVarListClause : public OMPClause { unsigned varlist_size() const { return NumVars; } bool varlist_empty() const { return NumVars == 0; } - varlist_range varlist() { - return varlist_range(varlist_begin(), varlist_end()); - } - varlist_const_range varlist() const { - return varlist_const_range(varlist_begin(), varlist_end()); - } + varlist_range varlist() { return getVarRefs(); } + varlist_const_range varlist() const { return getVarRefs(); } varlist_iterator varlist_begin() { return getVarRefs().begin(); } varlist_iterator varlist_end() { return getVarRefs().end(); } @@ -3404,14 +3400,10 @@ class OMPPrivateClause final using private_copies_const_range = llvm::iterator_range; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } child_range children() { @@ -3531,13 +3523,9 @@ class OMPFirstprivateClause final using private_copies_const_range = llvm::iterator_range; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } using inits_iterator = MutableArrayRef::iterator; @@ -3545,12 +3533,8 @@ class OMPFirstprivateClause final using inits_range = llvm::iterator_range; using inits_const_range = llvm::iterator_range; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } + inits_const_range inits() const { return getInits(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -3752,44 +3736,23 @@ class OMPLastprivateClause final /// copies of original lastprivate variables. 
void setPrivateCopies(ArrayRef PrivateCopies); - helper_expr_const_range private_copies() const { - return helper_expr_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + helper_expr_const_range private_copies() const { return getPrivateCopies(); } - helper_expr_range private_copies() { - return helper_expr_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + helper_expr_range private_copies() { return getPrivateCopies(); } - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4178,79 +4141,45 @@ class OMPReductionClause final using helper_flag_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } helper_flag_const_range private_var_reduction_flags() const { - return helper_flag_const_range(getPrivateVariableReductionFlags().begin(), - getPrivateVariableReductionFlags().end()); + return getPrivateVariableReductionFlags(); } helper_flag_range private_var_reduction_flags() { - return helper_flag_range(getPrivateVariableReductionFlags().begin(), - getPrivateVariableReductionFlags().end()); + return getPrivateVariableReductionFlags(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + 
helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } - helper_expr_const_range copy_ops() const { - return helper_expr_const_range(getInscanCopyOps().begin(), - getInscanCopyOps().end()); - } + helper_expr_const_range copy_ops() const { return getInscanCopyOps(); } - helper_expr_range copy_ops() { - return helper_expr_range(getInscanCopyOps().begin(), - getInscanCopyOps().end()); - } + helper_expr_range copy_ops() { return getInscanCopyOps(); } helper_expr_const_range copy_array_temps() const { - return helper_expr_const_range(getInscanCopyArrayTemps().begin(), - getInscanCopyArrayTemps().end()); + return getInscanCopyArrayTemps(); } - helper_expr_range copy_array_temps() { - return helper_expr_range(getInscanCopyArrayTemps().begin(), - getInscanCopyArrayTemps().end()); - } + helper_expr_range copy_array_temps() { return getInscanCopyArrayTemps(); } helper_expr_const_range copy_array_elems() const { - return helper_expr_const_range(getInscanCopyArrayElems().begin(), - getInscanCopyArrayElems().end()); + return getInscanCopyArrayElems(); } - helper_expr_range copy_array_elems() { - return helper_expr_range(getInscanCopyArrayElems().begin(), - getInscanCopyArrayElems().end()); - } + helper_expr_range copy_array_elems() { return getInscanCopyArrayElems(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4450,39 +4379,21 @@ class OMPTaskReductionClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4694,48 +4605,28 @@ class OMPInReductionClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return 
getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } helper_expr_const_range taskgroup_descriptors() const { - return helper_expr_const_range(getTaskgroupDescriptors().begin(), - getTaskgroupDescriptors().end()); + return getTaskgroupDescriptors(); } helper_expr_range taskgroup_descriptors() { - return helper_expr_range(getTaskgroupDescriptors().begin(), - getTaskgroupDescriptors().end()); + return getTaskgroupDescriptors(); } child_range children() { @@ -4965,52 +4856,36 @@ class OMPLinearClause final using privates_range = llvm::iterator_range; using privates_const_range = llvm::iterator_range; - privates_range privates() { - return privates_range(getPrivates().begin(), getPrivates().end()); - } + privates_range privates() { return getPrivates(); } - privates_const_range privates() const { - return privates_const_range(getPrivates().begin(), getPrivates().end()); - } + privates_const_range privates() const { return getPrivates(); } using inits_iterator = MutableArrayRef::iterator; using inits_const_iterator = ArrayRef::iterator; using inits_range = llvm::iterator_range; using inits_const_range = llvm::iterator_range; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_const_range inits() const { return getInits(); } using updates_iterator = MutableArrayRef::iterator; using updates_const_iterator = ArrayRef::iterator; using updates_range = llvm::iterator_range; using updates_const_range = llvm::iterator_range; - updates_range updates() { - return updates_range(getUpdates().begin(), getUpdates().end()); - } + updates_range updates() { return getUpdates(); } - updates_const_range updates() const { - return updates_const_range(getUpdates().begin(), getUpdates().end()); - } + updates_const_range updates() const { return getUpdates(); } using finals_iterator = MutableArrayRef::iterator; using finals_const_iterator = ArrayRef::iterator; using finals_range = llvm::iterator_range; using finals_const_range = llvm::iterator_range; - finals_range finals() { - return finals_range(getFinals().begin(), getFinals().end()); - } + finals_range finals() { return 
getFinals(); } - finals_const_range finals() const { - return finals_const_range(getFinals().begin(), getFinals().end()); - } + finals_const_range finals() const { return getFinals(); } using used_expressions_iterator = MutableArrayRef::iterator; using used_expressions_const_iterator = ArrayRef::iterator; @@ -5270,34 +5145,19 @@ class OMPCopyinClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -5433,34 +5293,19 @@ class OMPCopyprivateClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -6632,18 +6477,14 @@ class OMPMappableExprListClause : public OMPVarListClause, using const_all_decls_iterator = ArrayRef::iterator; using const_all_decls_range = llvm::iterator_range; - const_all_decls_range all_decls() const { - auto A = getUniqueDeclsRef(); - return const_all_decls_range(A.begin(), A.end()); - } + const_all_decls_range all_decls() const { return getUniqueDeclsRef(); } using const_all_num_lists_iterator = ArrayRef::iterator; 
using const_all_num_lists_range = llvm::iterator_range<const_all_num_lists_iterator>; const_all_num_lists_range all_num_lists() const { - auto A = getDeclNumListsRef(); - return const_all_num_lists_range(A.begin(), A.end()); + return getDeclNumListsRef(); } using const_all_lists_sizes_iterator = ArrayRef<unsigned>::iterator; @@ -6651,8 +6492,7 @@ class OMPMappableExprListClause : public OMPVarListClause<T>, llvm::iterator_range<const_all_lists_sizes_iterator>; const_all_lists_sizes_range all_lists_sizes() const { - auto A = getComponentListSizesRef(); - return const_all_lists_sizes_range(A.begin(), A.end()); + return getComponentListSizesRef(); } using const_all_components_iterator = ArrayRef<MappableComponent>::iterator; @@ -6660,8 +6500,7 @@ class OMPMappableExprListClause : public OMPVarListClause<T>, llvm::iterator_range<const_all_components_iterator>; const_all_components_range all_components() const { - auto A = getComponentsRef(); - return const_all_components_range(A.begin(), A.end()); + return getComponentsRef(); } using mapperlist_iterator = MutableArrayRef<Expr *>::iterator; @@ -8241,14 +8080,10 @@ class OMPUseDevicePtrClause final using private_copies_const_range = llvm::iterator_range<private_copies_const_iterator>; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } using inits_iterator = MutableArrayRef<Expr *>::iterator; @@ -8256,13 +8091,9 @@ class OMPUseDevicePtrClause final using inits_range = llvm::iterator_range<inits_iterator>; using inits_const_range = llvm::iterator_range<inits_const_iterator>; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_const_range inits() const { return getInits(); } child_range children() { return child_range(reinterpret_cast<Stmt **>(varlist_begin()), @@ -8904,8 +8735,7 @@ class OMPInitClause final } const_prefs_range prefs() const { - auto Prefs = const_cast<OMPInitClause *>(this)->prefs(); - return const_prefs_range(Prefs.begin(), Prefs.end()); + return const_prefs_range(const_cast<OMPInitClause *>(this)->prefs()); } static bool classof(const OMPClause *T) { From 95f0fab7fab48bbf37d3c02c0ea8b01ca73c30dd Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 24 Nov 2025 07:34:48 +0100 Subject: [PATCH 10/38] [clang][bytecode] Fix conditional operator scoping wrt. local variables (#169030) We used to create a scope for the true and false expressions of a conditional operator. This was done so that, e.g. in this example: ```c++ struct A { constexpr A(){}; ~A(); constexpr int get() { return 10; } }; // all-note 2{{declared here}} static_assert( (false ? A().get() : 1) == 1); ``` we did _not_ evaluate the true branch at all, meaning we did not register the local variable for the temporary of type `A`, which means we also didn't call its destructor. However, this breaks the case where the temporary needs to outlive the conditional operator and instead be destroyed via the surrounding `ExprWithCleanups`: ``` constexpr bool test2(bool b) { unsigned long __ms = b ?
(const unsigned long &)0 : __ms; return true; } static_assert(test2(true)); ``` Before this patch, we diagnosed this example: ```console ./array.cpp:180:15: error: static assertion expression is not an integral constant expression 180 | static_assert(test2(true)); | ^~~~~~~~~~~ ./array.cpp:177:24: note: read of temporary whose lifetime has ended 177 | unsigned long __ms = b ? (const unsigned long &)0 : __ms; | ^ ./array.cpp:180:15: note: in call to 'test2(true)' 180 | static_assert(test2(true)); | ^~~~~~~~~~~ ./array.cpp:177:51: note: temporary created here 177 | unsigned long __ms = b ? (const unsigned long &)0 : __ms; | ^ 1 error generated. ``` because the temporary created for the true branch got immediately destroyed. The problem, in essence, is that since the conditional operator doesn't create a scope at all, we register the local variables for both its branches, but we later only execute one of them, which means we should also only destroy the locals of one of the branches. We fix this similarly to clang codegen's `is_active` flag: in the case of a conditional operator (which is so far the only case where this is problematic, and this also helps minimize the performance impact of this change), we mark local variables as disabled-by-default and then emit an `EnableLocal` opcode later, which marks them as enabled. The code calling their destructors checks whether the local was enabled at all. --- clang/lib/AST/ByteCode/ByteCodeEmitter.h | 2 +- clang/lib/AST/ByteCode/Compiler.cpp | 34 +++++++++++++++------ clang/lib/AST/ByteCode/Compiler.h | 39 +++++++++++++++++++----- clang/lib/AST/ByteCode/EvalEmitter.cpp | 29 +++++++++++++++++- clang/lib/AST/ByteCode/Function.h | 2 ++ clang/lib/AST/ByteCode/Interp.h | 12 ++++++++ clang/lib/AST/ByteCode/InterpFrame.cpp | 12 ++++++++ clang/lib/AST/ByteCode/InterpFrame.h | 4 +++ clang/lib/AST/ByteCode/Opcodes.td | 10 ++++++ clang/test/AST/ByteCode/cxx23.cpp | 23 ++++++++++++++ 10 files changed, 148 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.h b/clang/lib/AST/ByteCode/ByteCodeEmitter.h index ca8dc38e65246..dd18341d52a09 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.h +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.h @@ -25,11 +25,11 @@ enum Opcode : uint32_t; /// An emitter which links the program to bytecode for later use. class ByteCodeEmitter { protected: - using LabelTy = uint32_t; using AddrTy = uintptr_t; using Local = Scope::Local; public: + using LabelTy = uint32_t; /// Compiles the function into the module. void compileFunc(const FunctionDecl *FuncDecl, Function *Func = nullptr); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 725db1f77f29c..958373ed94fe3 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -16,6 +16,7 @@ #include "PrimType.h" #include "Program.h" #include "clang/AST/Attr.h" +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace clang::interp; @@ -2500,17 +2501,18 @@ bool Compiler<Emitter>::VisitAbstractConditionalOperator( const Expr *TrueExpr = E->getTrueExpr(); const Expr *FalseExpr = E->getFalseExpr(); - auto visitChildExpr = [&](const Expr *E) -> bool { - LocalScope<Emitter> S(this); - if (!this->delegate(E)) - return false; - return S.destroyLocals(); - }; + // The TrueExpr and FalseExpr of a conditional operator do _not_ create a + // scope, which means the local variables created within them unconditionally + // always exist.
However, we later need to differentiate which branch was + // taken and only destroy the variables of the active branch. This is what the + // "enabled" flags on local variables are used for. + llvm::SaveAndRestore LAAA(this->VarScope->LocalsAlwaysEnabled, + /*NewValue=*/false); if (std::optional<bool> BoolValue = getBoolValue(Condition)) { if (*BoolValue) - return visitChildExpr(TrueExpr); - return visitChildExpr(FalseExpr); + return this->delegate(TrueExpr); + return this->delegate(FalseExpr); } bool IsBcpCall = false; @@ -2542,13 +2544,15 @@ bool Compiler<Emitter>::VisitAbstractConditionalOperator( if (!this->jumpFalse(LabelFalse)) return false; - if (!visitChildExpr(TrueExpr)) + if (!this->delegate(TrueExpr)) return false; + if (!this->jump(LabelEnd)) return false; this->emitLabel(LabelFalse); - if (!visitChildExpr(FalseExpr)) + if (!this->delegate(FalseExpr)) return false; + this->fallthrough(LabelEnd); this->emitLabel(LabelEnd); @@ -2955,10 +2959,15 @@ bool Compiler<Emitter>::VisitMaterializeTemporaryExpr( bool IsVolatile = SubExpr->getType().isVolatileQualified(); unsigned LocalIndex = allocateLocalPrimitive( E, *SubExprT, IsConst, IsVolatile, E->getExtendingDecl()); + if (!this->VarScope->LocalsAlwaysEnabled && + !this->emitEnableLocal(LocalIndex, E)) + return false; + if (!this->visit(SubExpr)) return false; if (!this->emitSetLocal(*SubExprT, LocalIndex, E)) return false; + return this->emitGetPtrLocal(LocalIndex, E); } @@ -2968,6 +2977,11 @@ bool Compiler<Emitter>::VisitMaterializeTemporaryExpr( if (UnsignedOrNone LocalIndex = allocateLocal(E, Inner->getType(), E->getExtendingDecl())) { InitLinkScope<Emitter> ILS(this, InitLink::Temp(*LocalIndex)); + + if (!this->VarScope->LocalsAlwaysEnabled && + !this->emitEnableLocal(*LocalIndex, E)) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; return this->visitInitializer(SubExpr) && this->emitFinishInit(E); diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 359bf28a51c6e..6a9fe1e954530 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -477,12 +477,14 @@ template <class Emitter> class VariableScope { VariableScope(Compiler<Emitter> *Ctx, const ValueDecl *VD, ScopeKind Kind = ScopeKind::Block) : Ctx(Ctx), Parent(Ctx->VarScope), ValDecl(VD), Kind(Kind) { + if (Parent) + this->LocalsAlwaysEnabled = Parent->LocalsAlwaysEnabled; Ctx->VarScope = this; } virtual ~VariableScope() { Ctx->VarScope = this->Parent; } - virtual void addLocal(const Scope::Local &Local) { + virtual void addLocal(Scope::Local Local) { llvm_unreachable("Shouldn't be called"); } @@ -519,7 +521,6 @@ template <class Emitter> class VariableScope { if (!P) break; } - // Add to this scope. this->addLocal(Local); } @@ -529,6 +530,11 @@ template <class Emitter> class VariableScope { VariableScope *getParent() const { return Parent; } ScopeKind getKind() const { return Kind; } + /// Whether locals added to this scope are enabled by default. /// This is almost always true, except for the two branches /// of a conditional operator. + bool LocalsAlwaysEnabled = true; + protected: /// Compiler instance.
Compiler<Emitter> *Ctx; @@ -566,29 +572,48 @@ template <class Emitter> class LocalScope : public VariableScope<Emitter> { return Success; } - void addLocal(const Scope::Local &Local) override { + void addLocal(Scope::Local Local) override { if (!Idx) { Idx = static_cast<unsigned>(this->Ctx->Descriptors.size()); this->Ctx->Descriptors.emplace_back(); this->Ctx->emitInitScope(*Idx, {}); } + Local.EnabledByDefault = this->LocalsAlwaysEnabled; this->Ctx->Descriptors[*Idx].emplace_back(Local); } bool emitDestructors(const Expr *E = nullptr) override { if (!Idx) return true; + assert(!this->Ctx->Descriptors[*Idx].empty()); + // Emit destructor calls for local variables of record // type with a destructor. for (Scope::Local &Local : llvm::reverse(this->Ctx->Descriptors[*Idx])) { if (Local.Desc->hasTrivialDtor()) continue; - if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) - return false; - if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) - return false; + if (!Local.EnabledByDefault) { typename Emitter::LabelTy EndLabel = this->Ctx->getLabel(); + if (!this->Ctx->emitGetLocalEnabled(Local.Offset, E)) + return false; + if (!this->Ctx->jumpFalse(EndLabel)) + return false; + + if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) + return false; + + if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) + return false; + + this->Ctx->emitLabel(EndLabel); + } else { + if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) + return false; + if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) + return false; + } removeIfStoredOpaqueValue(Local); } diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 007321791fdd4..a2e01efc8ffd9 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -113,7 +113,7 @@ Scope::Local EvalEmitter::createLocal(Descriptor *D) { InlineDescriptor &Desc = *reinterpret_cast<InlineDescriptor *>(B->rawData()); Desc.Desc = D; Desc.Offset = sizeof(InlineDescriptor); - Desc.IsActive = true; + Desc.IsActive = false; Desc.IsBase = false; Desc.IsFieldMutable = false; Desc.IsConst = false; @@ -322,6 +322,33 @@ bool EvalEmitter::emitDestroy(uint32_t I, SourceInfo Info) { return true; } +bool EvalEmitter::emitGetLocalEnabled(uint32_t I, SourceInfo Info) { + if (!isActive()) + return true; + + Block *B = getLocal(I); + const InlineDescriptor &Desc = + *reinterpret_cast<const InlineDescriptor *>(B->rawData()); + + S.Stk.push(Desc.IsActive); + return true; +} + +bool EvalEmitter::emitEnableLocal(uint32_t I, SourceInfo Info) { + if (!isActive()) + return true; + + // FIXME: This is a little dirty, but to avoid adding a flag to + // InlineDescriptor that's only ever useful on the toplevel of local + // variables, we reuse the IsActive flag for the enabled state. We should + // probably use a different struct than InlineDescriptor for the block-level + // inline descriptor of local variables. + Block *B = getLocal(I); + InlineDescriptor &Desc = *reinterpret_cast<InlineDescriptor *>(B->rawData()); + Desc.IsActive = true; + return true; +} + /// Global temporaries (LifetimeExtendedTemporary) carry their value /// around as an APValue, which codegen accesses. /// We set their value once when creating them, but we don't update it diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index 95add5809afcc..8c309c921afa9 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -41,6 +41,8 @@ class Scope final { unsigned Offset; /// Descriptor of the local.
Descriptor *Desc; + /// Whether the cleanup for this local should be emitted. + bool EnabledByDefault = true; }; using LocalVectorTy = llvm::SmallVector; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 3e869c1ee5f2c..86b1ba88ca9d4 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2474,6 +2474,18 @@ inline bool InitScope(InterpState &S, CodePtr OpPC, uint32_t I) { return true; } +inline bool EnableLocal(InterpState &S, CodePtr OpPC, uint32_t I) { + assert(!S.Current->isLocalEnabled(I)); + S.Current->enableLocal(I); + return true; +} + +inline bool GetLocalEnabled(InterpState &S, CodePtr OpPC, uint32_t I) { + assert(S.Current); + S.Stk.push(S.Current->isLocalEnabled(I)); + return true; +} + //===----------------------------------------------------------------------===// // Cast, CastFP //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 039acb5d72b2c..3b883761ad001 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -89,11 +89,23 @@ void InterpFrame::destroyScopes() { void InterpFrame::initScope(unsigned Idx) { if (!Func) return; + for (auto &Local : Func->getScope(Idx).locals()) { localBlock(Local.Offset)->invokeCtor(); } } +void InterpFrame::enableLocal(unsigned Idx) { + assert(Func); + + // FIXME: This is a little dirty, but to avoid adding a flag to + // InlineDescriptor that's only ever useful on the toplevel of local + // variables, we reuse the IsActive flag for the enabled state. We should + // probably use a different struct than InlineDescriptor for the block-level + // inline descriptor of local variables. + localInlineDesc(Idx)->IsActive = true; +} + void InterpFrame::destroy(unsigned Idx) { for (auto &Local : Func->getScope(Idx).locals_reverse()) { S.deallocate(localBlock(Local.Offset)); diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h index febef1097ea8a..e150e9279a6ef 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.h +++ b/clang/lib/AST/ByteCode/InterpFrame.h @@ -55,6 +55,10 @@ class InterpFrame final : public Frame { void destroy(unsigned Idx); void initScope(unsigned Idx); void destroyScopes(); + void enableLocal(unsigned Idx); + bool isLocalEnabled(unsigned Idx) const { + return localInlineDesc(Idx)->IsActive; + } /// Describes the frame with arguments for diagnostic purposes.
void describe(llvm::raw_ostream &OS) const override; diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index a236f89dcf78b..6e768793fcfcf 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -251,6 +251,16 @@ def InitScope : Opcode { let Args = [ArgUint32]; } +def GetLocalEnabled : Opcode { + let Args = [ArgUint32]; + let HasCustomEval = 1; +} + +def EnableLocal : Opcode { + let Args = [ArgUint32]; + let HasCustomEval = 1; +} + //===----------------------------------------------------------------------===// // Constants //===----------------------------------------------------------------------===// diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index c5d26925ce11b..819460628c1b7 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -473,3 +473,26 @@ namespace AIEWithIndex0Narrows { } static_assert(test()); } + +#if __cplusplus >= 202302L +namespace InactiveLocalsInConditionalOp { + struct A { constexpr A(){}; ~A(); constexpr int get() { return 10; } }; // all-note 2{{declared here}} + constexpr int get(bool b) { + return b ? A().get() : 1; // all-note {{non-constexpr function '~A' cannot be used in a constant expression}} + } + static_assert(get(false) == 1, ""); + static_assert(get(true) == 10, ""); // all-error {{not an integral constant expression}} \ + // all-note {{in call to}} + + static_assert( (false ? A().get() : 1) == 1); + static_assert( (true ? A().get() : 1) == 1); // all-error {{not an integral constant expression}} \ + // all-note {{non-constexpr function '~A' cannot be used in a constant expression}} + + constexpr bool test2(bool b) { + unsigned long __ms = b ? (const unsigned long &)0 : __ms; + return true; + } + static_assert(test2(true)); + +} +#endif From f5cae7b805946337f30437871ea6e13844507775 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 24 Nov 2025 18:07:03 +1100 Subject: [PATCH 11/38] [ORC] Add unit test for simple cycle in WaitingOnGraph::emit. (#169281) WaitingOnGraphTests.Emit_SingleContainerSimpleCycle tests a pair of emit operations where the second completes a simple cycle (1: A -> B, 2: B -> A). We already had a test of WaitingOnGraph::simplify's behavior in this case, but did not have one for WaitingOnGraph::emit. --- .../Orc/WaitingOnGraphTest.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp b/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp index 08b4e8f40e3d0..0d4a5212c1f0c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp @@ -399,6 +399,31 @@ TEST_F(WaitingOnGraphTest, Emit_TrivialSequence) { EXPECT_EQ(ER1.Failed.size(), 0U); } +TEST_F(WaitingOnGraphTest, Emit_SingleContainerSimpleCycle) { + // Test an emit of two nodes with a dependence cycle within a single + // container: + // N0: (0, 0) -> (0, 1) + // N1: (0, 1) -> (0, 0) + // We expect intra-simplify cycle elimination to clear both dependence sets, + // and coalescing to join them into one supernode covering both defs. 
+ SuperNodeBuilder B; + ContainerElementsMap Defs0({{0, {0}}}); + ContainerElementsMap Deps0({{0, {1}}}); + B.add(Defs0, Deps0); + + auto ER0 = emit(TestGraph::simplify(B.takeSuperNodes())); + EXPECT_EQ(ER0.Ready.size(), 0U); + EXPECT_EQ(ER0.Failed.size(), 0U); + + ContainerElementsMap Defs1({{0, {1}}}); + ContainerElementsMap Deps1({{0, {0}}}); + B.add(Defs1, Deps1); + auto ER1 = emit(TestGraph::simplify(B.takeSuperNodes())); + + EXPECT_EQ(collapseDefs(ER1.Ready), merge(Defs0, Defs1)); + EXPECT_EQ(ER1.Failed.size(), 0U); +} + TEST_F(WaitingOnGraphTest, Emit_TrivialReverseSequence) { // Perform a sequence of two emits where the first emit depends on the // second. Check that both nodes become ready after the second emit. From 02a997cf365d7cf9759ee732f27241f1242a84b3 Mon Sep 17 00:00:00 2001 From: owenca Date: Mon, 24 Nov 2025 00:00:10 -0800 Subject: [PATCH 12/38] [clang-format] Handle `import` when used as template function name (#169279) Fixes #149960 --- clang/lib/Format/TokenAnnotator.cpp | 5 ++++- clang/unittests/Format/FormatTest.cpp | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 19c42c88762fb..200ee13901f4b 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5033,8 +5033,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, return true; // Space between import <iostream>. // or import .....; - if (Left.is(Keywords.kw_import) && Right.isOneOf(tok::less, tok::ellipsis)) + if (Left.is(Keywords.kw_import) && + Right.isOneOf(tok::less, tok::ellipsis) && + (!BeforeLeft || BeforeLeft->is(tok::kw_export))) { return true; + } // Space between `module :` and `import :`. if (Left.isOneOf(Keywords.kw_module, Keywords.kw_import) && Right.is(TT_ModulePartitionColon)) { diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 81fa7d1d11aa4..5a5d77075bb3a 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -27328,6 +27328,7 @@ TEST_F(FormatTest, Cpp20ModulesSupport) { verifyFormat("export", Style); verifyFormat("import /* not keyword */ = val ? 2 : 1;"); + verifyFormat("_world->import();"); } TEST_F(FormatTest, CoroutineForCoawait) { From c15a6cc00b1a0e3a47d99172b839ec45c72168ae Mon Sep 17 00:00:00 2001 From: ganenkokb-yandex <160136233+ganenkokb-yandex@users.noreply.github.com> Date: Mon, 24 Nov 2025 11:47:29 +0300 Subject: [PATCH 13/38] [Clang][ASTImporter] Fix cycle in importing template specialization on auto type with typename (#162514) When importing a template specialization with an auto return type, the ASTImporter could run into a cycle when the return type is not a nested type but a typename coming from the template arguments and another template. Existing code already prevents this cycle for auto return types when the returned type is declared inside the function. The case solved here differs somewhat from nested types, but admits the same solution as UsedDifferentProtoType: the return type is determined only after the FunctionDecl has been created.
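For readers following the fix below: the importer keeps a set of FunctionDecls whose import is currently in flight, and an import that re-enters itself falls back to a simplified return type that is patched up later. A minimal sketch of that scoped cycle-detection idiom, where `InProgress` and `importFunction` are illustrative stand-ins rather than the patch's actual API:

```cpp
// Hypothetical sketch of scope-based cycle detection; `InProgress` and
// `importFunction` are illustrative names, not the patch's API.
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/ScopeExit.h"

static llvm::DenseSet<const void *> InProgress;

bool importFunction(const void *D) {
  // Re-entered while D is still being imported: report a cycle so the
  // caller can substitute a simplified return type for now.
  if (InProgress.contains(D))
    return false;
  InProgress.insert(D);
  // Unconditionally un-mark D when this frame unwinds.
  auto Cleanup = llvm::make_scope_exit([D] { InProgress.erase(D); });
  // ... import the return type and parameters here; that work may
  // recurse back into importFunction(D) and hit the check above ...
  return true;
}
```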
--- clang/include/clang/AST/ASTImporter.h | 16 ++++++++ clang/lib/AST/ASTImporter.cpp | 27 ++++++++++++- clang/unittests/AST/ASTImporterTest.cpp | 51 +++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h index 4a0ca45b785a9..39d1429639ed1 100644 --- a/clang/include/clang/AST/ASTImporter.h +++ b/clang/include/clang/AST/ASTImporter.h @@ -190,6 +190,16 @@ class TypeSourceInfo; llvm::SmallDenseMap Aux; }; + class FunctionDeclImportCycleDetector { + public: + auto makeScopedCycleDetection(const FunctionDecl *D); + + bool isCycle(const FunctionDecl *D) const; + + private: + llvm::DenseSet<const FunctionDecl *> FunctionDeclsWithImportInProgress; + }; + private: std::shared_ptr<ASTImporterSharedState> SharedState = nullptr; @@ -254,6 +264,12 @@ class TypeSourceInfo; /// Declaration (from, to) pairs that are known not to be equivalent /// (which we have already complained about). NonEquivalentDeclSet NonEquivalentDecls; + /// A FunctionDecl can have properties that refer to the function itself + /// and that are imported before the function is created. This can happen, + /// for example, with an auto return type or when template parameters are + /// used in the return type or parameters. This member is used to detect + /// cyclic imports of FunctionDecl objects and avoid infinite recursion. + FunctionDeclImportCycleDetector FindFunctionDeclImportCycle; using FoundDeclsTy = SmallVector; FoundDeclsTy findDeclsInToCtx(DeclContext *DC, DeclarationName Name); diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 735f3157b694e..c1441744c8578 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1291,6 +1291,26 @@ bool ASTNodeImporter::hasSameVisibilityContextAndLinkage(TypedefNameDecl *Found, using namespace clang; +auto ASTImporter::FunctionDeclImportCycleDetector::makeScopedCycleDetection( + const FunctionDecl *D) { + const FunctionDecl *LambdaD = nullptr; + if (!isCycle(D) && D) { + FunctionDeclsWithImportInProgress.insert(D); + LambdaD = D; + } + return llvm::make_scope_exit([this, LambdaD]() { + if (LambdaD) { + FunctionDeclsWithImportInProgress.erase(LambdaD); + } + }); +} + +bool ASTImporter::FunctionDeclImportCycleDetector::isCycle( + const FunctionDecl *D) const { + return FunctionDeclsWithImportInProgress.find(D) != + FunctionDeclsWithImportInProgress.end(); +} + ExpectedType ASTNodeImporter::VisitType(const Type *T) { Importer.FromDiag(SourceLocation(), diag::err_unsupported_ast_node) << T->getTypeClassName(); @@ -4038,7 +4058,10 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { // E.g.: auto foo() { struct X{}; return X(); } // To avoid an infinite recursion when importing, create the FunctionDecl // with a simplified return type. - if (hasReturnTypeDeclaredInside(D)) { + // Reuse this approach for auto return types declared as typenames from + // template params, tracked in FindFunctionDeclImportCycle.
+ if (hasReturnTypeDeclaredInside(D) || + Importer.FindFunctionDeclImportCycle.isCycle(D)) { FromReturnTy = Importer.getFromContext().VoidTy; UsedDifferentProtoType = true; } @@ -4061,6 +4084,8 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { } Error Err = Error::success(); + auto ScopedReturnTypeDeclCycleDetector = + Importer.FindFunctionDeclImportCycle.makeScopedCycleDetection(D); auto T = importChecked(Err, FromTy); auto TInfo = importChecked(Err, FromTSI); auto ToInnerLocStart = importChecked(Err, D->getInnerLocStart()); diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 3cab4c600b1b1..164790606ea0b 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3204,6 +3204,57 @@ TEST_P(ImportExpr, UnresolvedMemberExpr) { compoundStmt(has(callExpr(has(unresolvedMemberExpr()))))))))); } +TEST_P(ImportDecl, CycleInAutoTemplateSpec) { + MatchVerifier Verifier; + const char *Code = R"( + template + struct basic_string { + using value_type = _CharT; + }; + + template + struct basic_string_view { + using value_type = T; + }; + + using string_view = basic_string_view; + using string = basic_string; + + template + struct span { + }; + + template + auto StrCatT(span pieces) { + basic_string result; + return result; + } + + string StrCat(span pieces) { + return StrCatT(pieces); + } + + string StrCat(span pieces) { + return StrCatT(pieces); + } + + template + auto declToImport(T pieces) { + return StrCat(pieces); + } + + void test() { + span pieces; + auto result = declToImport(pieces); + } +)"; + // This test reproduces the StrCatT recursion pattern with concepts and span + // that may cause infinite recursion during AST import due to circular + // dependencies + testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier, + functionTemplateDecl(hasName("declToImport"))); +} + TEST_P(ImportExpr, ConceptNoRequirement) { MatchVerifier Verifier; const char *Code = R"( From e888cf863d5c0a83933f97cac04ae5dc5010e1a1 Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Mon, 24 Nov 2025 14:25:53 +0530 Subject: [PATCH 14/38] [AMDGPU] Add wave reduce intrinsics for float types - 2 (#168859) Supported Ops: `fadd`, `fsub` --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 42 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 2 + .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1001 +++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1001 +++++++++++++++++ 6 files changed, 2046 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 539036b283498..c2057ac3a14e6 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2482,7 +2482,7 @@ class AMDGPUWaveReduce : Intrinsic< multiclass AMDGPUWaveReduceOps { foreach Op = - ["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in { + ["umin", "fmin", "min", "umax", "fmax", "max", "add", "fadd", "sub", "fsub", "and", "or", "xor"] in { def Op : AMDGPUWaveReduce; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c6131d6b3b050..a88e4b2a2a31d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5214,7 +5214,9 @@ 
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_fmin: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c681d12ba7499..4fc397b51f1b1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5480,6 +5480,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits::min(); case AMDGPU::S_MAX_I32: return std::numeric_limits::min(); + case AMDGPU::V_ADD_F32_e64: // -0.0 + return 0x80000000; + case AMDGPU::V_SUB_F32_e64: // +0.0 + return 0x0; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: case AMDGPU::S_OR_B32: @@ -5525,11 +5529,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 || - Opc == AMDGPU::V_MAX_F32_e64; + Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 || + Opc == AMDGPU::V_SUB_F32_e64; } static bool isFloatingPointWaveReduceOperation(unsigned Opc) { - return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64; + return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 || + Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64; } static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, @@ -5576,8 +5582,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::V_ADD_F32_e64: case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U64_PSEUDO: { + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::V_SUB_F32_e64: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); @@ -5732,6 +5740,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addImm(AMDGPU::sub1); break; } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_SUB_F32_e64: { + Register ActiveLanesVreg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // Get number of active lanes as a float val. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64), + ActiveLanesVreg) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(0) // clamp + .addImm(0); // output-modifier + + // Take negation of input for SUB reduction + unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 
1 : 0; + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg) + .addImm(srcMod) // src0 modifier + .addReg(SrcReg) + .addImm(0) // src1 modifier + .addReg(ActiveLanesVreg) + .addImm(0) // clamp + .addImm(0); // output-mod + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } } RetBB = &BB; } @@ -5979,10 +6011,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1282ece9dc875..3fe37e8217f35 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -374,6 +374,8 @@ defvar Operations = [ WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>, WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, ]; foreach Op = Operations in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index a7ebf458d2591..2cb1811ff4f09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -2019,6 +2019,1007 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; 
GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: 
global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, 1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: 
global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, 1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; 
GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; 
GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_4: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 
s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: 
v_mul_f32_e32 v0, s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: 
v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fadd(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index fab269ea8cfb9..50ec375d04626 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -2220,6 +2220,1007 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; 
GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: 
uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032GISEL-NEXT: .LBB7_1: 
; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_sub_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_sub_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_4: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; 
GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_2: ; 
%Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; 
%else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 
exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fsub(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} From 1abb055c57c977f98267bdb89c856adfaa71e892 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 24 Nov 2025 08:57:55 +0000 Subject: [PATCH 15/38] [IVDesc] Make getCastInsts return an ArrayRef (NFC) (#169021) To make it clear that the return value is immutable. --- llvm/include/llvm/Analysis/IVDescriptors.h | 6 ++---- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 2 +- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 654a5f10cea96..2c8484fde5b16 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -451,12 +451,10 @@ class InductionDescriptor { : Instruction::BinaryOpsEnd; } - /// Returns a reference to the type cast instructions in the induction + /// Returns an ArrayRef to the type cast instructions in the induction /// update chain, that are redundant when guarded with a runtime /// SCEV overflow check. - const SmallVectorImpl &getCastInsts() const { - return RedundantCasts; - } + ArrayRef getCastInsts() const { return RedundantCasts; } private: /// Private constructor - used by \c isInductionPHI. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 86e742ca5fec1..ba21bbbe112e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -684,7 +684,7 @@ void LoopVectorizationLegality::addInductionPhi( // in the vectorized loop body, record them here. All casts could be recorded // here for ignoring, but suffices to record only the first (as it is the // only one that may bw used outside the cast sequence). 
-  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+  ArrayRef<Instruction *> Casts = ID.getCastInsts();
   if (!Casts.empty())
     InductionCastsToIgnore.insert(*Casts.begin());
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 277e43a38018e..7ac132a99fbec 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6575,8 +6575,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   // detection.
   for (const auto &Induction : Legal->getInductionVars()) {
     const InductionDescriptor &IndDes = Induction.second;
-    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
-    VecValuesToIgnore.insert_range(Casts);
+    VecValuesToIgnore.insert_range(IndDes.getCastInsts());
   }
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 2220d1a045de4..e7a8773be067b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -512,7 +512,7 @@ static void removeRedundantInductionCasts(VPlan &Plan) {
   // replace it with the original IV. Note that only the final cast is
   // expected to have users outside the cast-chain and the dead casts left
   // over will be cleaned up later.
-  auto &Casts = IV->getInductionDescriptor().getCastInsts();
+  ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
   VPValue *FindMyCast = IV;
   for (Instruction *IRCast : reverse(Casts)) {
     VPSingleDefRecipe *FoundUserCast = nullptr;

From ce70d4b5b5130f2ac8586c3dd2198dc91771f534 Mon Sep 17 00:00:00 2001
From: Gil Rapaport
Date: Mon, 24 Nov 2025 11:08:01 +0200
Subject: [PATCH 16/38] [mlir][emitc] Refactor getEmittedExpression (NFC)
 (#168361)

This method returns the current expression being emitted, but is only used
for testing whether an expression is being emitted or not. This patch
therefore replaces it with a boolean isEmittingExpression() method.

---
 mlir/lib/Target/Cpp/TranslateToCpp.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 56f81b0bea9e2..bcce0a5145221 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -253,8 +253,8 @@ struct CppEmitter {
     return !fileId.empty() && file.getId() == fileId;
   }
 
-  /// Get expression currently being emitted.
-  ExpressionOp getEmittedExpression() { return emittedExpression; }
+  /// Is expression currently being emitted.
+  bool isEmittingExpression() { return emittedExpression; }
 
   /// Determine whether given value is part of the expression potentially being
   /// emitted.
@@ -1717,7 +1717,7 @@ LogicalResult CppEmitter::emitGlobalVariable(GlobalOp op) {
 
 LogicalResult CppEmitter::emitAssignPrefix(Operation &op) {
   // If op is being emitted as part of an expression, bail out.
- if (getEmittedExpression()) + if (isEmittingExpression()) return success(); switch (op.getNumResults()) { @@ -1799,7 +1799,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { if (hasDeferredEmission(&op)) return success(); - if (getEmittedExpression() || + if (isEmittingExpression() || (isa(op) && shouldBeInlined(cast(op)))) return success(); From f21857313dfab543e66ef43b1aed43b685794a7c Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 24 Nov 2025 09:09:30 +0000 Subject: [PATCH 17/38] [llvm][CAS] Remove unused functions (#168856) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with GCC I got: ``` <...>/OnDiskGraphDB.cpp:624:18: warning: ‘static {anonymous}::DataRecordHandle {anonymous}::DataRecordHandle::construct(char*, const {anonymous}::DataRecordHandle::Input&)’ defined but not used [-Wunused-function] 624 | DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) { | ^~~~~~~~~~~~~~~~ <...>/OnDiskGraphDB.cpp:456:1: warning: ‘static {anonymous}::DataRecordHandle {anonymous}::DataRecordHandle::create(llvm::function_ref, const {anonymous}::DataRecordHandle::Input&)’ defined but not used [-Wunused-function] 456 | DataRecordHandle::create(function_ref Alloc, | ^~~~~~~~~~~~~~~~ ``` These implement parts of a class that is defined in an anonymous namespace. All llvm tests passed with them removed. --- llvm/lib/CAS/OnDiskGraphDB.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 2aede017133b0..84f27c4938050 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -452,13 +452,6 @@ Expected DataRecordHandle::createWithError( return Mem.takeError(); } -DataRecordHandle -DataRecordHandle::create(function_ref Alloc, - const Input &I) { - Layout L(I); - return constructImpl(Alloc(L.getTotalSize()), I, L); -} - ObjectHandle ObjectHandle::fromFileOffset(FileOffset Offset) { // Store the file offset as it is. assert(!(Offset.get() & 0x1)); @@ -621,10 +614,6 @@ bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) { return false; } -DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) { - return constructImpl(Mem, I, Layout(I)); -} - Expected DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset) { From c745a512dcfaa550c58b42bedd06464b7f593a26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ingo=20M=C3=BCller?= Date: Mon, 24 Nov 2025 09:22:46 +0000 Subject: [PATCH 18/38] [mlir:x86vector:transform] Fix bazel build after #168074. (#169294) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes the bazel build that went out of sync with the changes introduced in #168074. 
Signed-off-by: Ingo Müller --- .../llvm-project-overlay/mlir/BUILD.bazel | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 1421ec553f251..b7fe0fa898c21 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2578,6 +2578,45 @@ cc_library( ], ) +td_library( + name = "X86VectorTransformOpsTdFiles", + srcs = [ + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td", + ], + includes = ["include"], + deps = [ + ":OpBaseTdFiles", + ":SideEffectInterfacesTdFiles", + ":TransformDialectTdFiles", + ], +) + +gentbl_cc_library( + name = "X86VectorTransformOpsIncGen", + tbl_outs = { + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc": ["-gen-op-defs"], + }, + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td", + deps = [ + ":X86VectorTransformOpsTdFiles", + ], +) + +cc_library( + name = "X86VectorTransformOps", + srcs = glob(["lib/Dialect/X86Vector/TransformOps/*.cpp"]), + hdrs = glob(["include/mlir/Dialect/X86Vector/TransformOps/*.h"]), + includes = ["include"], + deps = [ + ":TransformDialect", + ":VectorDialect", + ":X86VectorTransformOpsIncGen", + ":X86VectorTransforms", + ], +) + cc_library( name = "X86VectorTransforms", srcs = glob(["lib/Dialect/X86Vector/Transforms/*.cpp"]), @@ -2588,6 +2627,7 @@ cc_library( ":IR", ":LLVMCommonConversion", ":LLVMDialect", + ":LinalgDialect", ":VectorDialect", ":VectorUtils", ":X86VectorDialect", @@ -9571,6 +9611,7 @@ cc_library( ":UBToLLVM", ":VectorToLLVM", ":VectorTransformOps", + ":X86VectorTransformOps", ":XeGPUTransformOps", ":XeVMToLLVM", ":XeVMToLLVMIRTranslation", From 6413e5a2df8ca75c4b54b2577bbec9a9d31911b0 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 24 Nov 2025 10:29:28 +0100 Subject: [PATCH 19/38] [clangd] Implement fold range for #pragma region (#168177) The implementation is based on the directive tree. 
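For readers unfamiliar with the directive tree, the matching itself is ordinary
bracket matching over preprocessor directives: push the position of each
`#pragma region`, pop and record a range at each matching `#pragma endregion`,
and ignore unmatched ends. A minimal standalone sketch of the idea (names and
container choices here are illustrative, not clangd's actual `DirectiveTree`
API):

```cpp
#include <string>
#include <utility>
#include <vector>

// Pair up nested "#pragma region"/"#pragma endregion" lines with a stack.
// Each matched pair becomes one foldable range, analogous to the patch's
// Token::Range entries; unmatched "endregion" lines are silently ignored.
std::vector<std::pair<unsigned, unsigned>>
matchRegions(const std::vector<std::string> &Lines) {
  std::vector<unsigned> Stack;                      // open region line indices
  std::vector<std::pair<unsigned, unsigned>> Folds; // (start, end) pairs
  for (unsigned I = 0; I < Lines.size(); ++I) {
    if (Lines[I].rfind("#pragma region", 0) == 0)
      Stack.push_back(I);
    else if (Lines[I].rfind("#pragma endregion", 0) == 0 && !Stack.empty()) {
      Folds.push_back({Stack.back(), I});
      Stack.pop_back();
    }
  }
  return Folds;
}
```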
Fixes https://github.com/clangd/clangd/issues/1623 --- .../clangd/SemanticSelection.cpp | 81 ++++++++++++++++++- .../unittests/SemanticSelectionTests.cpp | 22 +++++ 2 files changed, 100 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clangd/SemanticSelection.cpp b/clang-tools-extra/clangd/SemanticSelection.cpp index 3353121a01825..c2dad53bcec6b 100644 --- a/clang-tools-extra/clangd/SemanticSelection.cpp +++ b/clang-tools-extra/clangd/SemanticSelection.cpp @@ -11,9 +11,13 @@ #include "Protocol.h" #include "Selection.h" #include "SourceCode.h" +#include "support/Bracket.h" +#include "support/DirectiveTree.h" +#include "support/Token.h" #include "clang/AST/DeclBase.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/TokenKinds.h" #include "clang/Tooling/Syntax/BuildTree.h" #include "clang/Tooling/Syntax/Nodes.h" #include "clang/Tooling/Syntax/TokenBufferTokenManager.h" @@ -22,9 +26,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Error.h" -#include "support/Bracket.h" -#include "support/DirectiveTree.h" -#include "support/Token.h" #include #include #include @@ -163,6 +164,69 @@ llvm::Expected getSemanticRanges(ParsedAST &AST, Position Pos) { return std::move(Head); } +class PragmaRegionFinder { + // Record the token range of a region: + // + // #pragma region name[[ + // ... + // ]]#pragma endregion + std::vector &Ranges; + const TokenStream &Code; + // Stack of starting token (the name of the region) indices for nested #pragma + // region. + std::vector Stack; + +public: + PragmaRegionFinder(std::vector &Ranges, const TokenStream &Code) + : Ranges(Ranges), Code(Code) {} + + void walk(const DirectiveTree &T) { + for (const auto &C : T.Chunks) + std::visit(*this, C); + } + + void operator()(const DirectiveTree::Code &C) {} + + void operator()(const DirectiveTree::Directive &D) { + // Get the tokens that make up this directive. + auto Tokens = Code.tokens(D.Tokens); + if (Tokens.empty()) + return; + const Token &HashToken = Tokens.front(); + assert(HashToken.Kind == tok::hash); + const Token &Pragma = HashToken.nextNC(); + if (Pragma.text() != "pragma") + return; + const Token &Value = Pragma.nextNC(); + + // Handle "#pragma region name" + if (Value.text() == "region") { + // Find the last token at the same line. + const Token *T = &Value.next(); + while (T < Tokens.end() && T->Line == Pragma.Line) + T = &T->next(); + --T; + Stack.push_back(T->OriginalIndex); + return; + } + + // Handle "#pragma endregion" + if (Value.text() == "endregion") { + if (Stack.empty()) + return; // unmatched end region; ignore. + + unsigned StartIdx = Stack.back(); + Stack.pop_back(); + Ranges.push_back(Token::Range{StartIdx, HashToken.OriginalIndex}); + } + } + + void operator()(const DirectiveTree::Conditional &C) { + for (const auto &[_, SubTree] : C.Branches) + walk(SubTree); + } +}; + // FIXME(kirillbobyrev): Collect comments, PP conditional regions, includes and // other code regions (e.g. public/private/protected sections of classes, // control flow statement bodies). 
@@ -286,6 +350,17 @@ getFoldingRanges(const std::string &Code, bool LineFoldingOnly) { } AddFoldingRange(Start, End, FoldingRange::COMMENT_KIND); } + + // #pragma region + std::vector Ranges; + PragmaRegionFinder(Ranges, OrigStream).walk(DirectiveStructure); + auto Ts = OrigStream.tokens(); + for (const auto &R : Ranges) { + auto End = StartPosition(Ts[R.End]); + if (LineFoldingOnly) + End.line--; + AddFoldingRange(EndPosition(Ts[R.Begin]), End, FoldingRange::REGION_KIND); + } return Result; } diff --git a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp index 4efae25dcd077..2a381d1c8add5 100644 --- a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp @@ -410,6 +410,22 @@ TEST(FoldingRanges, PseudoParserWithoutLineFoldings) { Variable = 3; # )cpp", + R"cpp( + #pragma region R1[[ + + #pragma region R2[[ + constexpr int a = 2; + ]]#pragma endregion + + ]]#pragma endregion + )cpp", + R"cpp( + #pragma region[[ + ]]#pragma endregion + + #pragma /*comment1*/ region /*comment2*/name[[ + ]]#pragma endregion + )cpp", }; for (const char *Test : Tests) { auto T = Annotations(Test); @@ -470,6 +486,12 @@ TEST(FoldingRanges, PseudoParserLineFoldingsOnly) { //[[ foo /* bar */]] )cpp", + R"cpp( + #pragma region abc[[ + constexpr int a = 2; + ]] + #pragma endregion + )cpp", // FIXME: Support folding template arguments. // R"cpp( // template <[[typename foo, class bar]]> struct baz {}; From 30b1d1422733c012c274f173a3f4986615f7c1c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Benics?= Date: Mon, 24 Nov 2025 09:30:11 +0000 Subject: [PATCH 20/38] [analyzer] Fix inf recursion in StackAddrEscapeChecker for self referencing blocks (#169208) Objective-C blocks are like lambdas. They have captures, just like lambdas. However, they can also implicitly capture themselves unlike lambdas. This means that when walking the captures of a block, we may end up in infinite recursion. This is not possible with lambdas, but happened in practice with blocks downstream. In this patch, I just use a set to keep track of the visited MemRegions. Note that theoretically, there is nothing preventing usual lambdas or functors from falling for the same trap, but probably slightly more difficult to do so. You would likely need a pointer to itself, etc. I'll not speculate here. This inf recursion was likely caused by #126620, released in clang-21. 
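The fix is the standard visited-set idiom for traversing a graph that may
contain cycles. A self-contained sketch of the pattern (the `Region` type
below is a stand-in for illustration; the checker itself tracks visited
`MemRegion` pointers in an `llvm::SmallPtrSet`):

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Stand-in for a region that can (indirectly) reference itself, the way an
// Objective-C block can capture the variable holding the block.
struct Region {
  std::vector<const Region *> Captures;
};

// Without the Visited set, a self-referential Region recurses forever; the
// insert().second check breaks the cycle exactly like the checker's guard.
void visit(const Region *R, std::set<const Region *> &Visited) {
  if (!Visited.insert(R).second)
    return; // already seen
  for (const Region *C : R->Captures)
    visit(C, Visited);
}

int main() {
  Region Fib;
  Fib.Captures.push_back(&Fib); // the block captures itself
  std::set<const Region *> Visited;
  visit(&Fib, Visited); // terminates
  std::puts("no infinite recursion");
}
```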
rdar://162215172 --- .../Checkers/StackAddrEscapeChecker.cpp | 5 +++++ clang/test/Analysis/stackaddrleak.c | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp index 019e81f91400d..027bf780273cc 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp @@ -22,6 +22,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" using namespace clang; using namespace ento; @@ -247,6 +248,7 @@ class FindStackRegionsSymbolVisitor final : public SymbolVisitor { CheckerContext &Ctxt; const StackFrameContext *PoppedStackFrame; SmallVectorImpl &EscapingStackRegions; + llvm::SmallPtrSet VisitedRegions; public: explicit FindStackRegionsSymbolVisitor( @@ -258,6 +260,9 @@ class FindStackRegionsSymbolVisitor final : public SymbolVisitor { bool VisitSymbol(SymbolRef sym) override { return true; } bool VisitMemRegion(const MemRegion *MR) override { + if (!VisitedRegions.insert(MR).second) + return true; + SaveIfEscapes(MR); if (const BlockDataRegion *BDR = MR->getAs()) diff --git a/clang/test/Analysis/stackaddrleak.c b/clang/test/Analysis/stackaddrleak.c index 95175996e8274..96bd4e4ea19e5 100644 --- a/clang/test/Analysis/stackaddrleak.c +++ b/clang/test/Analysis/stackaddrleak.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s typedef __INTPTR_TYPE__ intptr_t; char const *p; @@ -90,3 +90,14 @@ struct child_stack_context_s return_child_stack_context_field() { } return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}} } + +// Returns an 'int' block taking an 'int'. +int (^copy_self_referencing_block(void))(int) { + // It is important that the 'fib' block captures itself. 
+ __block int (^fib)(int) = ^(int n) { + if (n <= 1) return n; + return fib(n - 1) + fib(n - 2); + }; + return fib; // no-crash when copying a self-referencing 'fib' + // expected-warning-re@-1 {{Address of stack-allocated block declared on line {{[0-9]+}} is captured by a returned block}} +} From 4604762cc336317b0f02f7d8c1576f6205f4ea61 Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Mon, 24 Nov 2025 15:13:11 +0530 Subject: [PATCH 21/38] [AMDGPU] Add builtins for wave reduction intrinsics (#161816) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 12 +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 84 ++++++++++++++++++++ 3 files changed, 100 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 81e684a04a03d..a3ded0f6a9983 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 92ee69218620b..81b3fe9e79483 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -390,18 +390,26 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: + return Intrinsic::amdgcn_wave_reduce_fadd; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: + return Intrinsic::amdgcn_wave_reduce_fsub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: + return Intrinsic::amdgcn_wave_reduce_fmin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: return Intrinsic::amdgcn_wave_reduce_umin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: + return Intrinsic::amdgcn_wave_reduce_fmax; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: return Intrinsic::amdgcn_wave_reduce_umax; @@ -423,11 +431,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::SyncScope::ID SSID; switch (BuiltinID) { case 
AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index b92454de60c78..a5132c9114673 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_iterative(global int* out, int in) @@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_dpp(global int* out, int in) @@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_default(global int* out, int in) @@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_iterative(global int* out, int in) @@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_iterative(global float* out, float in) +{ + 
*out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_dpp(global int* out, int in) @@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_and_b32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( void test_wave_reduce_and_b32_default(global int* out, int in) @@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmin_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_iterative(global int* out, int in) @@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_dpp(global int* out, int in) @@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fmin_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( void test_wave_reduce_min_u32_default(global int* out, int in) @@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_iterative(global int* out, int in) @@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_dpp // CHECK: {{.*}}call{{.*}} i32 
@llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_dpp(global int* out, int in) @@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( void test_wave_reduce_max_u32_default(global int* out, int in) From 4b65cafa182a9b91131bfce986e815c9a4ab6ae5 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 24 Nov 2025 10:00:17 +0000 Subject: [PATCH 22/38] [AArch64][SVE] Add custom lowering for bfloat FMUL (with +bf16) (#167502) This lowers an SVE FMUL of bf16 using the BFMLAL top/bottom instructions rather than extending to an f32 mul. This does require zeroing the accumulator, but requires fewer extends/unpacking. --- .../Target/AArch64/AArch64ISelLowering.cpp | 72 ++++++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 + llvm/test/CodeGen/AArch64/sve-bf16-arith.ll | 85 +++++--- .../test/CodeGen/AArch64/sve-bf16-combines.ll | 198 +++++++----------- 4 files changed, 205 insertions(+), 151 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0bbe117ecf51f..e91f5a877b35b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1809,11 +1809,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (!Subtarget->hasSVEB16B16() || !Subtarget->isNonStreamingSVEorSME2Available()) { - for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM, - ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) { - setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32); - setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32); - setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32); + for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { + MVT PromotedVT = VT.changeVectorElementType(MVT::f32); + setOperationPromotedToType(ISD::FADD, VT, PromotedVT); + setOperationPromotedToType(ISD::FMA, VT, PromotedVT); + setOperationPromotedToType(ISD::FMAXIMUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMAXNUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMINIMUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMINNUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FSUB, VT, PromotedVT); + + if (VT != MVT::nxv2bf16 && Subtarget->hasBF16()) + setOperationAction(ISD::FMUL, VT, Custom); + else + setOperationPromotedToType(ISD::FMUL, VT, PromotedVT); } } @@ -7670,6 +7679,57 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, EndOfTrmp); } +SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (VT.getScalarType() != MVT::bf16 || + (Subtarget->hasSVEB16B16() && + Subtarget->isNonStreamingSVEorSME2Available())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); + + assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering"); + assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16) && "Unexpected FMUL VT"); + + auto MakeGetIntrinsic = [&](Intrinsic::ID IID) { + return [&, IID](EVT VT, auto... 
Ops) { + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(IID, DL, MVT::i32), Ops...); + }; + }; + + auto ReinterpretCast = [&](SDValue Value, EVT VT) { + if (VT == Value.getValueType()) + return Value; + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value); + }; + + // Create helpers for building intrinsic calls. + auto BFMLALB = MakeGetIntrinsic(Intrinsic::aarch64_sve_bfmlalb); + auto BFMLALT = MakeGetIntrinsic(Intrinsic::aarch64_sve_bfmlalt); + auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2); + auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2); + + // All intrinsics expect to operate on full bf16 vector types. + SDValue LHS = ReinterpretCast(Op.getOperand(0), MVT::nxv8bf16); + SDValue RHS = ReinterpretCast(Op.getOperand(1), MVT::nxv8bf16); + + SDValue Zero = + DAG.getNeutralElement(ISD::FADD, DL, MVT::nxv4f32, Op->getFlags()); + SDValue Pg = DAG.getConstant(1, DL, MVT::nxv4i1); + + // Lower bf16 FMUL as a pair (VT == nxv8bf16) of BFMLAL top/bottom + // instructions. These result in two f32 vectors, which can be converted back + // to bf16 with FCVT and FCVTNT. + SDValue BottomF32 = BFMLALB(MVT::nxv4f32, Zero, LHS, RHS); + SDValue BottomBF16 = + FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32); + // Note: nxv4bf16 only uses even lanes. + if (VT == MVT::nxv4bf16) + return ReinterpretCast(BottomBF16, VT); + SDValue TopF32 = BFMLALT(MVT::nxv4f32, Zero, LHS, RHS); + return FCVTNT(VT, BottomBF16, Pg, TopF32); +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -7744,7 +7804,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FSUB: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); + return LowerFMUL(Op, DAG); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index be198e54cbcbf..ca08eb40c956a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -614,6 +614,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const; SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 2103bc30b8381..6917ac12999bf 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16,NOB16B16-NONSTREAMING ; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16 -; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16,NOB16B16-STREAMING ; 
RUN: llc -mattr=+sme2,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16 target triple = "aarch64-unknown-linux-gnu" @@ -530,49 +530,80 @@ define @fmul_nxv2bf16( %a, %a, %b + %res = fmul nsz %a, %b ret %res } define @fmul_nxv4bf16( %a, %b) { -; NOB16B16-LABEL: fmul_nxv4bf16: -; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 -; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: ret +; NOB16B16-NONSTREAMING-LABEL: fmul_nxv4bf16: +; NOB16B16-NONSTREAMING: // %bb.0: +; NOB16B16-NONSTREAMING-NEXT: movi v2.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: ptrue p0.s +; NOB16B16-NONSTREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NONSTREAMING-NEXT: ret ; ; B16B16-LABEL: fmul_nxv4bf16: ; B16B16: // %bb.0: ; B16B16-NEXT: ptrue p0.s ; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h ; B16B16-NEXT: ret - %res = fmul %a, %b +; +; NOB16B16-STREAMING-LABEL: fmul_nxv4bf16: +; NOB16B16-STREAMING: // %bb.0: +; NOB16B16-STREAMING-NEXT: mov z2.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: ptrue p0.s +; NOB16B16-STREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-STREAMING-NEXT: ret + %res = fmul nsz %a, %b ret %res } define @fmul_nxv8bf16( %a, %b) { -; NOB16B16-LABEL: fmul_nxv8bf16: +; NOB16B16-NONSTREAMING-LABEL: fmul_nxv8bf16: +; NOB16B16-NONSTREAMING: // %bb.0: +; NOB16B16-NONSTREAMING-NEXT: movi v2.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: movi v3.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: ptrue p0.s +; NOB16B16-NONSTREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NONSTREAMING-NEXT: bfcvtnt z0.h, p0/m, z3.s +; NOB16B16-NONSTREAMING-NEXT: ret +; +; B16B16-LABEL: fmul_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfmul z0.h, z0.h, z1.h +; B16B16-NEXT: ret +; +; NOB16B16-STREAMING-LABEL: fmul_nxv8bf16: +; NOB16B16-STREAMING: // %bb.0: +; NOB16B16-STREAMING-NEXT: mov z2.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: mov z3.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: ptrue p0.s +; NOB16B16-STREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-STREAMING-NEXT: bfcvtnt z0.h, p0/m, z3.s +; NOB16B16-STREAMING-NEXT: ret + %res = fmul nsz %a, %b + ret %res +} + +define @fmul_nxv8bf16_no_nsz( %a, %b) { +; NOB16B16-LABEL: fmul_nxv8bf16_no_nsz: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z2.s, z1.h -; NOB16B16-NEXT: uunpkhi z3.s, z0.h -; NOB16B16-NEXT: uunpklo z1.s, z1.h -; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: mov z2.s, #0x80000000 +; NOB16B16-NEXT: mov z3.s, #0x80000000 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 -; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s -; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvtnt z0.h, p0/m, z3.s ; NOB16B16-NEXT: ret ; -; B16B16-LABEL: fmul_nxv8bf16: +; B16B16-LABEL: fmul_nxv8bf16_no_nsz: ; B16B16: // %bb.0: ; 
B16B16-NEXT: bfmul z0.h, z0.h, z1.h ; B16B16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index be721930cf015..a8049806a679b 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -410,28 +410,21 @@ define @fsub_sel_negzero_nxv8bf16( %a define @fadd_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: movi v3.2d, #0000000000000000 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: uzp1 z1.h, z1.h, z2.h -; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: movi v2.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h ; SVE-NEXT: uunpklo z0.s, z0.h -; SVE-NEXT: uunpkhi z2.s, z1.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: sel z1.h, p0, z1.h, z2.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s @@ -457,24 +450,20 @@ define @fadd_sel_fmul_nxv8bf16( %a, < define @fsub_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s @@ -497,24 +486,20 @@ define @fsub_sel_fmul_nxv8bf16( %a, < define @fadd_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: 
bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -537,24 +522,20 @@ define @fadd_sel_fmul_nsz_nxv8bf16( % define @fsub_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s @@ -577,24 +558,20 @@ define @fsub_sel_fmul_nsz_nxv8bf16( % define @fadd_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -618,28 +595,21 @@ define @fadd_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: fmul z1.s, z1.s, 
z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: dupm z3.h, #0x8000 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: uzp1 z1.h, z1.h, z2.h -; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: dupm z2.h, #0x8000 +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h ; SVE-NEXT: uunpklo z0.s, z0.h -; SVE-NEXT: uunpkhi z2.s, z1.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: sel z1.h, p0, z1.h, z2.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s @@ -666,24 +636,20 @@ define @fsub_sel_fmul_negzero_nxv8bf16( @fadd_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -707,24 +673,20 @@ define @fadd_sel_fmul_negzero_nsz_nxv8bf16( @fsub_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s From 121e2e9e377c1db6b5cb536e7fd0b78244c0ce04 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 24 Nov 2025 11:09:00 +0100 Subject: [PATCH 23/38] [libc++] Introduce basic_string::__allocate_long_buffer_for_growing (#162633) Introducing this utility makes the `__grow_by{,_and_replace}` significantly easier to understand and allows us to migrate 
away from these functions in the future.

---
 libcxx/include/string | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/libcxx/include/string b/libcxx/include/string
index c4806069d0b44..6b42cb2c7586d 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -2384,6 +2384,19 @@ private:
     return __guess;
   }
 
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type
+  __get_amortized_growth_capacity(size_type __required_capacity) {
+    size_type __max_size = max_size();
+    if (__required_capacity > __max_size)
+      __throw_length_error();
+    size_type __current_cap = capacity();
+    _LIBCPP_ASSERT_INTERNAL(
+        __current_cap < __required_capacity, "Trying to grow string even though there is enough capacity already?");
+    if (__current_cap > __max_size / 2 - __alignment)
+      return __max_size;
+    return std::max(__required_capacity, 2 * __current_cap);
+  }
+
   inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz);
 
   inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(size_type __n, value_type __c);
 
@@ -2714,14 +2727,10 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__
     size_type __n_del,
     size_type __n_add,
     const value_type* __p_new_stuff) {
-  size_type __ms = max_size();
-  if (__delta_cap > __ms - __old_cap)
-    __throw_length_error();
+  __long __buffer = __allocate_long_buffer(__alloc_, __get_amortized_growth_capacity(__old_cap + __delta_cap));
   pointer __old_p = __get_pointer();
-  size_type __cap = __old_cap < __ms / 2 - __alignment ? std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms;
   __annotate_delete();
-  auto __guard = std::__make_scope_guard(__annotate_new_size(*this));
-  __long __buffer = __allocate_long_buffer(__alloc_, __cap);
+  auto __guard = std::__make_scope_guard(__annotate_new_size(*this));
   if (__n_copy != 0)
     traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__old_p), __n_copy);
   if (__n_add != 0)
@@ -2751,12 +2760,8 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait
     size_type __n_copy,
     size_type __n_del,
     size_type __n_add) {
-  size_type __ms = max_size();
-  if (__delta_cap > __ms - __old_cap)
-    this->__throw_length_error();
+  __long __buffer = __allocate_long_buffer(__alloc_, __get_amortized_growth_capacity(__old_cap + __delta_cap));
   pointer __old_p = __get_pointer();
-  size_type __cap = __old_cap < __ms / 2 - __alignment ? std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms;
-  __long __buffer = __allocate_long_buffer(__alloc_, __cap);
   if (__n_copy != 0)
     traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__old_p), __n_copy);
   size_type __sec_cp_sz = __old_sz - __n_del - __n_copy;

From f3ce5dec690e11645e0b838132d3306b56c0ec97 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Mon, 24 Nov 2025 11:09:40 +0100
Subject: [PATCH 24/38] [libc++] Forward std::all_of and std::none_of to
 std::any_of (#167670)

This allows propagating optimizations to different algorithms by just
optimizing the lowest one. This is especially relevant now that we're starting
to optimize how we iterate through ranges (e.g. the segmented iterator
optimizations) and adding assumptions so the compiler can better leverage
semantics guaranteed by the standard (e.g. `__builtin_assume_dereferenceable`).
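The transformation is De Morgan's laws applied to the three algorithms:
`all_of(p)` is `!any_of(not p)` and `none_of(p)` is `!any_of(p)`, so once
`any_of` is the single underlying loop, any improvement to that loop benefits
all three. A minimal sketch of the equivalence outside of libc++ (the free
functions here are illustrative, not the internal `__any_of` helpers):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

template <class It, class Pred>
bool myAllOf(It First, It Last, Pred P) {
  // all_of(p) == !any_of(!p)
  return !std::any_of(First, Last, [&](const auto &V) { return !P(V); });
}

template <class It, class Pred>
bool myNoneOf(It First, It Last, Pred P) {
  // none_of(p) == !any_of(p)
  return !std::any_of(First, Last, P);
}

int main() {
  std::vector<int> V{2, 4, 6};
  assert(myAllOf(V.begin(), V.end(), [](int X) { return X % 2 == 0; }));
  assert(myNoneOf(V.begin(), V.end(), [](int X) { return X > 10; }));
}
```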
--- libcxx/include/__algorithm/all_of.h | 16 ++- libcxx/include/__algorithm/none_of.h | 8 +- .../robust_against_nonbool.compile.pass.cpp | 136 ++++++++++++++++++ 3 files changed, 151 insertions(+), 9 deletions(-) create mode 100644 libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp diff --git a/libcxx/include/__algorithm/all_of.h b/libcxx/include/__algorithm/all_of.h index 6acc117fc47bc..9bdb20a0d7b2f 100644 --- a/libcxx/include/__algorithm/all_of.h +++ b/libcxx/include/__algorithm/all_of.h @@ -10,24 +10,28 @@ #ifndef _LIBCPP___ALGORITHM_ALL_OF_H #define _LIBCPP___ALGORITHM_ALL_OF_H +#include <__algorithm/any_of.h> #include <__config> #include <__functional/identity.h> #include <__type_traits/invoke.h> +#include <__utility/forward.h> +#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template <class _Iter, class _Sent, class _Pred, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __all_of(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) { - for (; __first != __last; ++__first) { - if (!std::__invoke(__pred, std::__invoke(__proj, *__first))) - return false; - } - return true; + using _Ref = decltype(std::__invoke(__proj, *__first)); + auto __negated_pred = [&__pred](_Ref __arg) -> bool { return !std::__invoke(__pred, std::forward<_Ref>(__arg)); }; + return !std::__any_of(std::move(__first), std::move(__last), __negated_pred, __proj); } template <class _InputIterator, class _Predicate> @@ -39,4 +43,6 @@ all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_ALL_OF_H diff --git a/libcxx/include/__algorithm/none_of.h b/libcxx/include/__algorithm/none_of.h index e6bd197622292..1e1c8d1aad637 100644 --- a/libcxx/include/__algorithm/none_of.h +++ b/libcxx/include/__algorithm/none_of.h @@ -10,7 +10,9 @@ #ifndef _LIBCPP___ALGORITHM_NONE_OF_H #define _LIBCPP___ALGORITHM_NONE_OF_H +#include <__algorithm/any_of.h> #include <__config> +#include <__functional/identity.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif @@ -21,10 +23,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _InputIterator, class _Predicate> [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { - for (; __first != __last; ++__first) - if (__pred(*__first)) - return false; - return true; + __identity __proj; + return !std::__any_of(__first, __last, __pred, __proj); } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp b/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp new file mode 100644 index 0000000000000..e7c32d244a565 --- /dev/null +++ b/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// <algorithm> + +// Algorithms that take predicates should support predicates that return a non-boolean value as long as the +// returned type is implicitly convertible to bool.
+ +#include <algorithm> + +#include <cstddef> + +#include "boolean_testable.h" + +using Value = StrictComparable; using Iterator = StrictBooleanIterator; auto pred1 = StrictUnaryPredicate; auto pred2 = StrictBinaryPredicate; + +void f(Iterator it, Iterator out, std::size_t n, Value const& val, std::initializer_list<Value> ilist) { + (void)std::any_of(it, it, pred1); + (void)std::all_of(it, it, pred1); + (void)std::none_of(it, it, pred1); + (void)std::find_if(it, it, pred1); + (void)std::find_if_not(it, it, pred1); + (void)std::find_first_of(it, it, it, it); + (void)std::find_first_of(it, it, it, it, pred2); + (void)std::adjacent_find(it, it); + (void)std::adjacent_find(it, it, pred2); + (void)std::mismatch(it, it, it, it); + (void)std::mismatch(it, it, it, it, pred2); + (void)std::mismatch(it, it, it); + (void)std::mismatch(it, it, it); + (void)std::mismatch(it, it, it, pred2); + (void)std::equal(it, it, it, it); + (void)std::equal(it, it, it, it, pred2); + (void)std::equal(it, it, it); + (void)std::equal(it, it, it, pred2); + (void)std::lexicographical_compare(it, it, it, it); + (void)std::lexicographical_compare(it, it, it, it, pred2); + (void)std::partition_point(it, it, pred1); + (void)std::lower_bound(it, it, val); + (void)std::lower_bound(it, it, val, pred2); + (void)std::upper_bound(it, it, val); + (void)std::upper_bound(it, it, val, pred2); + (void)std::equal_range(it, it, val); + (void)std::equal_range(it, it, val, pred2); + (void)std::binary_search(it, it, val); + (void)std::binary_search(it, it, val, pred2); + (void)std::min(val, val); + (void)std::min(val, val, pred2); + (void)std::min(ilist); + (void)std::min(ilist, pred2); + (void)std::max(val, val); + (void)std::max(val, val, pred2); + (void)std::max(ilist); + (void)std::max(ilist, pred2); + (void)std::minmax(val, val); + (void)std::minmax(val, val, pred2); + (void)std::minmax(ilist); + (void)std::minmax(ilist, pred2); + (void)std::min_element(it, it); + (void)std::min_element(it, it, pred2); + (void)std::max_element(it, it); + (void)std::max_element(it, it, pred2); + (void)std::minmax_element(it, it); + (void)std::minmax_element(it, it, pred2); + (void)std::count_if(it, it, pred1); + (void)std::search(it, it, it, it); + (void)std::search(it, it, it, it, pred2); + (void)std::search_n(it, it, n, val); + (void)std::search_n(it, it, n, val, pred2); + (void)std::is_partitioned(it, it, pred1); + (void)std::is_sorted(it, it); + (void)std::is_sorted(it, it, pred2); + (void)std::is_sorted_until(it, it); + (void)std::is_sorted_until(it, it, pred2); + (void)std::is_heap(it, it); + (void)std::is_heap(it, it, pred2); + (void)std::is_heap_until(it, it); + (void)std::is_heap_until(it, it, pred2); + (void)std::clamp(val, val, val); + (void)std::clamp(val, val, val, pred2); + (void)std::is_permutation(it, it, it, it); + (void)std::is_permutation(it, it, it, it, pred2); + (void)std::copy_if(it, it, out, pred1); + (void)std::remove_copy_if(it, it, out, pred1); + (void)std::remove_copy(it, it, out, val); + (void)std::replace(it, it, val, val); + (void)std::replace_if(it, it, pred1, val); + (void)std::replace_copy_if(it, it, out, pred1, val); + (void)std::replace_copy(it, it, out, val, val); + (void)std::unique_copy(it, it, out, pred2); + (void)std::partition_copy(it, it, out, out, pred1); + (void)std::partial_sort_copy(it, it, it, it, pred2); + (void)std::merge(it, it, it, it, out); + (void)std::merge(it, it, it, it, out, pred2); + (void)std::set_difference(it, it, it, it, out, pred2); + (void)std::set_intersection(it, it, it, it, out, pred2); +
(void)std::set_symmetric_difference(it, it, it, it, out, pred2); + (void)std::set_union(it, it, it, it, out, pred2); + (void)std::remove_if(it, it, pred1); + (void)std::remove(it, it, val); + (void)std::unique(it, it, pred2); + (void)std::partition(it, it, pred1); + (void)std::stable_partition(it, it, pred1); + (void)std::sort(it, it); + (void)std::sort(it, it, pred2); + (void)std::stable_sort(it, it); + (void)std::stable_sort(it, it, pred2); + (void)std::partial_sort(it, it, it); + (void)std::partial_sort(it, it, it, pred2); + (void)std::nth_element(it, it, it); + (void)std::nth_element(it, it, it, pred2); + (void)std::inplace_merge(it, it, it); + (void)std::inplace_merge(it, it, it, pred2); + (void)std::make_heap(it, it); + (void)std::make_heap(it, it, pred2); + (void)std::push_heap(it, it); + (void)std::push_heap(it, it, pred2); + (void)std::pop_heap(it, it); + (void)std::pop_heap(it, it, pred2); + (void)std::sort_heap(it, it); + (void)std::sort_heap(it, it, pred2); + (void)std::prev_permutation(it, it); + (void)std::prev_permutation(it, it, pred2); + (void)std::next_permutation(it, it); + (void)std::next_permutation(it, it, pred2); +} From 4b35ff583fbc61074bdf7b1ebf908d31e578f5ac Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 24 Nov 2025 18:14:19 +0800 Subject: [PATCH 25/38] [RISCV] Enable rematerialization for scalar loads (#166774) In some workloads we see an argument passed on the stack where it is loaded, only for it to be immediately spilled to a different slot on the stack and then reloaded from that spill slot later on. We can avoid the unnecessary spill by marking loads as rematerializable and just directly loading from where the argument was originally passed on the stack. TargetInstrInfo::isReMaterializableImpl checks to make sure that any loads are `MI.isDereferenceableInvariantLoad()`, so we should be able to move the load down to the remat site. This gives a 14.8% reduction in spills in 544.nab_r on rva23u64 -O3, and a few other smaller reductions on llvm-test-suite. I didn't find any benchmarks where the number of spills/reloads increased.
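As a rough illustration of the pattern (hypothetical C++ source, not taken from the patch; `clobber_everything` stands in for the register-clobbering inline asm used in the test below):

extern void clobber_everything(); // forces every live value out of registers

// On RV64, only the first eight integer arguments travel in a0-a7, so
// `stackarg` arrives in the caller's frame and is loaded from the stack.
void use_stack_arg(long a0, long a1, long a2, long a3, long a4, long a5,
                   long a6, long a7, long stackarg, volatile long *p) {
  *p = stackarg;        // first use: load from the incoming argument slot
  clobber_everything();
  *p = stackarg;        // previously: spill before the call, reload after;
                        // now: re-issue the load from the original slot
}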
Related: #165761 --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4 +- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 2 +- llvm/test/CodeGen/RISCV/remat.ll | 169 ++++++++++++++++++++- llvm/test/CodeGen/RISCV/rvv/pr95865.ll | 4 +- 6 files changed, 174 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9cb53fb27a2d2..84b962b2a8607 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1, isReMaterializable = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,7 +889,7 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { -let canFoldAsLoad = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 4ffe3e62ac501..deacd41e6469a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,7 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index b30f8ec820c15..bd191001b75ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,7 +330,7 @@ class PseudoFROUND //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 1c6a5afcda49b..c172d1739ba61 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,7 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/test/CodeGen/RISCV/remat.ll b/llvm/test/CodeGen/RISCV/remat.ll index 8490dd0877d30..8a252751165d0 100644 --- a/llvm/test/CodeGen/RISCV/remat.ll +++ b/llvm/test/CodeGen/RISCV/remat.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O1 
-mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O1 -mtriple=riscv64 -mattr=+d,+zfh,+zfbfmin -verify-machineinstrs < %s | FileCheck %s @a = common global i32 0, align 4 @l = common global i32 0, align 4 @@ -200,3 +200,170 @@ for.end: ; preds = %for.inc, %entry } declare i32 @foo(i32, i32, i32, i32, i32, i32) + +define void @remat_load(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, double %8, double %9, double %10, double %11, double %12, double %13, double %14, double %15, i8 %stackarg0, i16 %stackarg1, i32 %stackarg2, i64 %stackarg3, half %stackarg4, bfloat %stackarg5, float %stackarg6, double %stackarg7, ptr %p) nounwind { +; CHECK-LABEL: remat_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -208 +; CHECK-NEXT: sd ra, 200(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 192(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 184(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 176(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 168(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 160(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 152(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 144(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s7, 136(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s8, 128(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s9, 120(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s10, 112(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s11, 104(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs0, 96(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs1, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs2, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs3, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs4, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs5, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs6, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs7, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs8, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs9, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs10, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs11, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: fld fa5, 264(sp) +; CHECK-NEXT: flw fa4, 256(sp) +; CHECK-NEXT: flh fa3, 248(sp) +; CHECK-NEXT: flh fa2, 240(sp) +; CHECK-NEXT: ld a0, 272(sp) +; CHECK-NEXT: lbu a4, 208(sp) +; CHECK-NEXT: lh a3, 216(sp) +; CHECK-NEXT: lw a2, 224(sp) +; CHECK-NEXT: ld a1, 232(sp) +; CHECK-NEXT: sb a4, 0(a0) +; CHECK-NEXT: sh a3, 0(a0) +; CHECK-NEXT: sw a2, 0(a0) +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: fsh fa2, 0(a0) +; CHECK-NEXT: fsh fa3, 0(a0) +; CHECK-NEXT: fsw fa4, 0(a0) +; CHECK-NEXT: fsd fa5, 0(a0) +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld a0, 272(sp) +; CHECK-NEXT: lbu a1, 208(sp) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: lh a1, 216(sp) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: lw a1, 224(sp) +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ld a1, 232(sp) +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: flh fa5, 240(sp) +; CHECK-NEXT: fsh fa5, 0(a0) +; CHECK-NEXT: flh fa5, 248(sp) +; CHECK-NEXT: fsh fa5, 0(a0) +; CHECK-NEXT: flw fa5, 256(sp) +; CHECK-NEXT: fsw fa5, 0(a0) +; CHECK-NEXT: fld fa5, 264(sp) +; CHECK-NEXT: fsd fa5, 0(a0) +; CHECK-NEXT: ld ra, 200(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 192(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 184(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 176(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 168(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 160(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 152(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 144(sp) # 8-byte 
Folded Reload +; CHECK-NEXT: ld s7, 136(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s8, 128(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s9, 120(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s10, 112(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s11, 104(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs0, 96(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs1, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs2, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs3, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs4, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs5, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs6, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs7, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs8, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs9, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs10, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs11, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 208 +; CHECK-NEXT: ret +entry: + ; Add a use of the stack arguments here so that we will have to load them from + ; the stack before the inline asm. Otherwise we would be exercising the + ; machine scheduler, not rematerialization. + store volatile i8 %stackarg0, ptr %p + store volatile i16 %stackarg1, ptr %p + store volatile i32 %stackarg2, ptr %p + store volatile i64 %stackarg3, ptr %p + store volatile half %stackarg4, ptr %p + store volatile bfloat %stackarg5, ptr %p + store volatile float %stackarg6, ptr %p + store volatile double %stackarg7, ptr %p + tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31},~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() + ; Now use them after spilling everything to force rematerialization + store volatile i8 %stackarg0, ptr %p + store volatile i16 %stackarg1, ptr %p + store volatile i32 %stackarg2, ptr %p + store volatile i64 %stackarg3, ptr %p + store volatile half %stackarg4, ptr %p + store volatile bfloat %stackarg5, ptr %p + store volatile float %stackarg6, ptr %p + store volatile double %stackarg7, ptr %p + ret void +} + +; We could remat the load of the constant global if we extended the live +; interval of the high bits of the address. 
+ +@const = external constant i32 +define i32 @constglobal_load() nounwind { +; CHECK-LABEL: constglobal_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -112 +; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: lui a0, %hi(const) +; CHECK-NEXT: lw a0, %lo(const)(a0) +; CHECK-NEXT: sd a0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld a0, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: addiw a0, a0, 1 +; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 112 +; CHECK-NEXT: ret +entry: + %global = load i32, ptr @const + tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() + %a = add i32 %global, 1 + ret i32 %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab9849631663c..01d66b344ec2e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -40,8 +40,6 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, Date: Mon, 24 Nov 2025 18:19:15 +0800 Subject: [PATCH 26/38] [clang][LoongArch] Introduce LASX and LSX conversion intrinsics (#157819) This patch introduces the LASX and LSX conversion intrinsics: - __m256 __lasx_cast_128_s (__m128) - __m256d __lasx_cast_128_d (__m128d) - __m256i __lasx_cast_128 (__m128i) - __m256 __lasx_concat_128_s (__m128, __m128) - __m256d __lasx_concat_128_d (__m128d, __m128d) - __m256i __lasx_concat_128 (__m128i, __m128i) - __m128 __lasx_extract_128_lo_s (__m256) - __m128d __lasx_extract_128_lo_d (__m256d) - __m128i __lasx_extract_128_lo (__m256i) - __m128 __lasx_extract_128_hi_s (__m256) - __m128d __lasx_extract_128_hi_d (__m256d) - __m128i __lasx_extract_128_hi (__m256i) - __m256 __lasx_insert_128_lo_s (__m256, __m128) - __m256d __lasx_insert_128_lo_d (__m256d, __m128d) - __m256i __lasx_insert_128_lo (__m256i, __m128i) - __m256 __lasx_insert_128_hi_s (__m256, __m128) - __m256d __lasx_insert_128_hi_d (__m256d, __m128d) - __m256i __lasx_insert_128_hi (__m256i, __m128i) Relevant GCC patch:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=c2013267642fea4a6e89b826940c8aa80a76089d --- .../clang/Basic/BuiltinsLoongArchLASX.def | 19 ++ clang/lib/Basic/Targets/LoongArch.cpp | 1 + clang/lib/Headers/lasxintrin.h | 113 +++++++++++ .../CodeGen/LoongArch/lasx/builtin-alias.c | 171 +++++++++++++++++ clang/test/CodeGen/LoongArch/lasx/builtin.c | 175 ++++++++++++++++++ clang/test/Preprocessor/init-loongarch.c | 3 + 6 files changed, 482 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def index c4ea46a3bc5b5..a5eee613d5c9e 100644 --- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def +++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def @@ -986,3 +986,22 @@ TARGET_BUILTIN(__builtin_lasx_xbnz_b, "iV32Uc", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_h, "iV16Us", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_w, "iV8Ui", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_d, "iV4ULLi", "nc", "lasx") + +TARGET_BUILTIN(__builtin_lasx_cast_128_s, "V8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_cast_128_d, "V4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_cast_128, "V4LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128_s, "V8fV4fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128_d, "V4dV2dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128, "V4LLiV2LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo_s, "V4fV8f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo_d, "V2dV4d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo, "V2LLiV4LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi_s, "V4fV8f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi_d, "V2dV4d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi, "V2LLiV4LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo_s, "V8fV8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo_d, "V4dV4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo, "V4LLiV4LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi_s, "V8fV8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi_d, "V4dV4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi, "V4LLiV4LLiV2LLi", "nc", "lasx") diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index 8e29bb745734b..5863af3f3b920 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -242,6 +242,7 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__loongarch_simd_width", "256"); Builder.defineMacro("__loongarch_sx", Twine(1)); Builder.defineMacro("__loongarch_asx", Twine(1)); + Builder.defineMacro("__loongarch_asx_sx_conv", Twine(1)); } else if (HasFeatureLSX) { Builder.defineMacro("__loongarch_simd_width", "128"); Builder.defineMacro("__loongarch_sx", Twine(1)); diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h index 85020d82829e2..83cc4288a990c 100644 --- a/clang/lib/Headers/lasxintrin.h +++ b/clang/lib/Headers/lasxintrin.h @@ -10,6 +10,8 @@ #ifndef _LOONGSON_ASXINTRIN_H #define _LOONGSON_ASXINTRIN_H 1 +#include <lsxintrin.h> + #if defined(__loongarch_asx) typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); @@ -3882,5 +3884,116 @@ #define __lasx_xvrepli_w(/*si10*/ _1) ((__m256i)__builtin_lasx_xvrepli_w((_1))) +#if defined(__loongarch_asx_sx_conv) + +extern __inline +
__attribute__((__gnu_inline__, __always_inline__, + __artificial__)) __m256 __lasx_cast_128_s(__m128 _1) { + return (__m256)__builtin_lasx_cast_128_s((v4f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_cast_128_d(__m128d _1) { + return (__m256d)__builtin_lasx_cast_128_d((v2f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_cast_128(__m128i _1) { + return (__m256i)__builtin_lasx_cast_128((v2i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_concat_128_s(__m128 _1, __m128 _2) { + return (__m256)__builtin_lasx_concat_128_s((v4f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_concat_128_d(__m128d _1, __m128d _2) { + return (__m256d)__builtin_lasx_concat_128_d((v2f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_concat_128(__m128i _1, __m128i _2) { + return (__m256i)__builtin_lasx_concat_128((v2i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lasx_extract_128_lo_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_lo_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_lo_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_lo_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_lo(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_lo((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lasx_extract_128_hi_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_hi_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_hi_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_hi_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_hi(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_hi((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_lo_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_lo_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_lo_d(__m256d _1, __m128d _2) { + return (__m256d)__builtin_lasx_insert_128_lo_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_insert_128_lo(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_lo((v4i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_hi_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_hi_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_hi_d(__m256d _1, __m128d _2) { + return (__m256d)__builtin_lasx_insert_128_hi_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) __m256i + __lasx_insert_128_hi(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_hi((v4i64)_1, (v2i64)_2); +} + +#endif /* defined(__loongarch_asx_sx_conv). */ #endif /* defined(__loongarch_asx). */ #endif /* _LOONGSON_ASXINTRIN_H. */ diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c index 03a746c966cdd..4f289ca84c271 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c @@ -7120,6 +7120,177 @@ v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); } // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); } +// CHECK-LABEL: define dso_local void @cast_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 cast_128_s(v4f32 _1) { return __lasx_cast_128_s(_1); } +// CHECK-LABEL: define dso_local void @cast_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 cast_128_d(v2f64 _1) { return __lasx_cast_128_d(_1); } +// CHECK-LABEL: define dso_local void @cast_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 cast_128(v2i64 _1) { return __lasx_cast_128(_1); } +// CHECK-LABEL: define dso_local void @concat_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __lasx_concat_128_s(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __lasx_concat_128_d(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 concat_128(v2i64 _1, v2i64 _2) { return __lasx_concat_128(_1, _2); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_lo_s(v8f32 _1) { return __lasx_extract_128_lo_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_lo_d(v4f64 _1) { return __lasx_extract_128_lo_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_lo(v4i64 _1) { return __lasx_extract_128_lo(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_hi_s(v8f32 _1) { return __lasx_extract_128_hi_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_hi_d(v4f64 _1) { return __lasx_extract_128_hi_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_hi(v4i64 _1) { return __lasx_extract_128_hi(_1); } +// CHECK-LABEL: define dso_local void @insert_128_lo_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_lo_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_lo_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __lasx_insert_128_lo(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_hi_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_hi_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __lasx_insert_128_hi(_1, _2); } //. 
// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c index 700e845cd662a..373efefc9b264 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c @@ -1,6 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -O2 -emit-llvm %s -o - | FileCheck %s +typedef long long v2i64 __attribute__ ((vector_size(16), aligned(16))); +typedef float v4f32 __attribute__((vector_size(16), aligned(16))); +typedef double v2f64 __attribute__((vector_size(16), aligned(16))); + typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); typedef signed char v32i8_b __attribute__((vector_size(32), aligned(1))); typedef unsigned char v32u8 __attribute__((vector_size(32), aligned(32))); @@ -7142,6 +7146,177 @@ v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); } // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); } +// CHECK-LABEL: define dso_local void @cast_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 cast_128_s(v4f32 _1) { return __builtin_lasx_cast_128_s(_1); } +// CHECK-LABEL: define dso_local void @cast_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 cast_128_d(v2f64 _1) { return __builtin_lasx_cast_128_d(_1); } +// CHECK-LABEL: define dso_local void @cast_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 cast_128(v2i64 _1) { return __builtin_lasx_cast_128(_1); } +// CHECK-LABEL: define dso_local void @concat_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to 
<4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __builtin_lasx_concat_128_s(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __builtin_lasx_concat_128_d(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 concat_128(v2i64 _1, v2i64 _2) { return __builtin_lasx_concat_128(_1, _2); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_lo_s(v8f32 _1) { return __builtin_lasx_extract_128_lo_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_lo_d(v4f64 _1) { return __builtin_lasx_extract_128_lo_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_lo(v4i64 _1) { return __builtin_lasx_extract_128_lo(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_hi_s(v8f32 _1) { return __builtin_lasx_extract_128_hi_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_hi_d(v4f64 _1) { return __builtin_lasx_extract_128_hi_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_hi(v4i64 _1) { return __builtin_lasx_extract_128_hi(_1); } +// CHECK-LABEL: define dso_local void @insert_128_lo_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_lo_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) 
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_lo_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_lo(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_hi_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_hi_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_hi(_1, _2); } //. // CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index fd7ce2073a512..b2c0b51464a80 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -925,6 +925,7 @@ // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MLSX %s // MLSX-NOT: #define __loongarch_asx +// MLSX-NOT: #define __loongarch_asx_sx_conv // MLSX: #define __loongarch_simd_width 128 // MLSX: #define __loongarch_sx 1 @@ -937,6 +938,7 @@ // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MLASX %s // MLASX: #define __loongarch_asx 1 +// MLASX: #define __loongarch_asx_sx_conv 1 // MLASX: #define __loongarch_simd_width 256 // MLASX: #define __loongarch_sx 1 @@ -953,5 +955,6 @@ // RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s // MNO-LSX-NOT: #define __loongarch_asx +// MNO-LSX-NOT: #define __loongarch_asx_sx_conv // MNO-LSX-NOT: #define __loongarch_simd_width // MNO-LSX-NOT: #define __loongarch_sx From 8c6ec1212720ebbbd7dc10e1fea2602a5d58eef5 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Mon, 24 Nov 2025 12:32:29 +0200 Subject: [PATCH 27/38] [libc++][array] Applied `[[nodiscard]]` (#168829) `[[nodiscard]]` should be applied to functions where discarding the return value is most likely a correctness issue. 
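A discarded bounds-checked read, for instance, is almost always a bug. A minimal illustration (not part of this patch; the diagnostic text is taken from the updated verify tests):

```cpp
#include <array>

void f(std::array<int, 4> a) {
  a.at(3); // warning: ignoring return value of function declared with
           // 'nodiscard' attribute -- the checked read has no effect
  int x = a.at(3); // OK: the result is used
  (void)x;
}
```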
- https://libcxx.llvm.org/CodingGuidelines.html#apply-nodiscard-where-relevant --------- Co-authored-by: Hristo Hristov --- libcxx/include/array | 142 +++++++++++------- .../diagnostics/array.nodiscard.verify.cpp | 69 ++++++++- .../array/array.creation/to_array.verify.cpp | 6 +- 3 files changed, 154 insertions(+), 63 deletions(-) diff --git a/libcxx/include/array b/libcxx/include/array index ff46838e2e8e2..0b0c85458999c 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -210,28 +210,28 @@ struct array { } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data(), data()); # else return const_iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data() + _Size, data()); # else return iterator(data() + _Size); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data() + _Size, data()); # else @@ -239,62 +239,81 @@ struct array { # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 
const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return _Size; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return _Size; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return _Size; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return _Size; } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return _Size == 0; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type __n) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type __n) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference + operator[](size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type __n) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type __n) { if (__n >= _Size) std::__throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type __n) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type __n) const { if (__n >= _Size) std::__throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { + return (*this)[0]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { + return (*this)[0]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { + return (*this)[_Size - 1]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return __elems_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return __elems_; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* 
data() _NOEXCEPT { + return __elems_; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { + return __elems_; + } }; template @@ -328,8 +347,10 @@ struct array<_Tp, 0> { }; _ALIGNAS_TYPE(_ArrayInStructT) _EmptyType __elems_[sizeof(_ArrayInStructT)]; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return nullptr; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return nullptr; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { + return nullptr; + } // No explicit construct/copy/destroy for aggregate type _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(const value_type&) { @@ -341,28 +362,28 @@ struct array<_Tp, 0> { } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return const_iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else @@ -370,68 +391,77 @@ struct array<_Tp, 0> { # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { return begin(); } - 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return 0; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return 0; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return 0; } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return true; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference + operator[](size_type) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type) { std::__throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type) const { std::__throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized 
array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized array"); __libcpp_unreachable(); } @@ -501,25 +531,29 @@ struct tuple_element<_Ip, array<_Tp, _Size> > { }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& +get(array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +get(const array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& get(array<_Tp, _Size>&& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& +get(array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array &&)"); return std::move(__a.__elems_[_Ip]); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& +get(const array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array &&)"); return std::move(__a.__elems_[_Ip]); } @@ -539,7 +573,7 @@ __to_array_rvalue_impl(_Tp (&&__arr)[_Size], index_sequence<_Index...>) { } template -_LIBCPP_HIDE_FROM_ABI constexpr array, _Size> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr array, _Size> to_array(_Tp (&__arr)[_Size]) noexcept(is_nothrow_constructible_v<_Tp, _Tp&>) { static_assert(!is_array_v<_Tp>, "[array.creation]/1: to_array does not accept multidimensional arrays."); static_assert(is_constructible_v<_Tp, _Tp&>, "[array.creation]/1: to_array requires copy constructible elements."); @@ -547,7 +581,7 @@ to_array(_Tp (&__arr)[_Size]) noexcept(is_nothrow_constructible_v<_Tp, _Tp&>) { } template -_LIBCPP_HIDE_FROM_ABI constexpr array, _Size> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr array, _Size> to_array(_Tp (&&__arr)[_Size]) noexcept(is_nothrow_move_constructible_v<_Tp>) { static_assert(!is_array_v<_Tp>, "[array.creation]/4: to_array does not accept multidimensional arrays."); static_assert(is_move_constructible_v<_Tp>, "[array.creation]/4: to_array requires move constructible elements."); diff --git a/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp index 25a2f80b48f02..8e49807732de7 100644 --- a/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp @@ -11,13 +11,70 @@ // check that functions are marked [[nodiscard]] #include +#include -void array_test() { - std::array array; - array.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#include + +template +void test_members() { + std::array a; + const 
std::array ca{}; + + a.begin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.begin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.end(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.end(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.rbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.rbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.rend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.rend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.cbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.cbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.cend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.cend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.crbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.crbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.crend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.crend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a.size(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.max_size(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.empty(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a[0]; // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca[0]; // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.at(0); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.at(0); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a.front(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.front(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.back(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.back(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a.data(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.data(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} +} + +template +void test_get() { + std::array a{}; + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::get<0>(a); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::get<0>(std::move(a)); +} + +#if TEST_STD_VER >= 20 +void test_to_array() { + // 
expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::to_array("zmt"); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::to_array({94, 82, 49}); } +#endif -void empty_array_test() { - std::array array; - array.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +void test() { + test_members<0>(); + test_members<82>(); } diff --git a/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp b/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp index e3efef988f0f4..ee8d8e6cfb2e5 100644 --- a/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp +++ b/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp @@ -22,21 +22,21 @@ int main(int, char**) { // expected-error@array:* {{to_array does not accept multidimensional arrays}} // expected-error@array:* {{to_array requires copy constructible elements}} // expected-error@array:* 3 {{cannot initialize}} - std::to_array(source); // expected-note {{requested here}} + (void)std::to_array(source); // expected-note {{requested here}} } { MoveOnly mo[] = {MoveOnly{3}}; // expected-error@array:* {{to_array requires copy constructible elements}} // expected-error-re@array:* 1-2{{{{(call to implicitly-deleted copy constructor of 'MoveOnly')|(call to deleted constructor of 'MoveOnly')}}}} - std::to_array(mo); // expected-note {{requested here}} + (void)std::to_array(mo); // expected-note {{requested here}} } { const MoveOnly cmo[] = {MoveOnly{3}}; // expected-error@array:* {{to_array requires move constructible elements}} // expected-error-re@array:* 0-1{{{{(call to implicitly-deleted copy constructor of 'MoveOnly')|(call to deleted constructor of 'MoveOnly')}}}} - std::to_array(std::move(cmo)); // expected-note {{requested here}} + (void)std::to_array(std::move(cmo)); // expected-note {{requested here}} } return 0; From 4c4cf71095c4e2e2063793d889db3ca984dd375e Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Mon, 24 Nov 2025 12:32:50 +0200 Subject: [PATCH 28/38] [libc++][list] Applied `[[nodiscard]]` (#169015) `[[nodiscard]]` should be applied to functions where discarding the return value is most likely a correctness issue. 
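The same reasoning applies to the list accessors: reading `front()` or `back()` and dropping the result has no effect. A hedged sketch mirroring the updated verify test (not part of this patch):

```cpp
#include <list>

void peek(const std::list<int>& l) {
  // Precondition: !l.empty().
  l.front(); // warning: ignoring return value of function declared with
             // 'nodiscard' attribute -- the read has no effect
  int first = l.front(); // OK: the result is used
  (void)first;
}
```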
- https://libcxx.llvm.org/CodingGuidelines.html#apply-nodiscard-where-relevant --- libcxx/include/list | 52 ++++++++++++------- .../diagnostics/list.nodiscard.verify.cpp | 31 ++++++++++- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/libcxx/include/list b/libcxx/include/list index 2898a45da0029..a5c84cad51489 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -774,57 +774,71 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT; + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT; - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return this->__size_; } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { + return this->__size_; + } [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::empty(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(this->__node_alloc_max_size(), numeric_limits::max()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __base::end(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __base::end(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + return __base::begin(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + return __base::begin(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { + return __base::end(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + return __base::end(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __base::end(); } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + return __base::end(); + } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI 
reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + crbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); return __base::__end_.__next_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); return __base::__end_.__next_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference back() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); return __base::__end_.__prev_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference back() const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference back() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); return __base::__end_.__prev_->__as_node()->__get_value(); } diff --git a/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp index f19224a71f5cc..bfce9b85ef76c 100644 --- a/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp @@ -13,6 +13,33 @@ #include void test() { - std::list list; - list.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::list l; + const std::list cl; + + l.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + l.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.cbegin(); // 
expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + l.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } From 74a62b12b0461cf051dbb4c3842fdccad305411b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 24 Nov 2025 10:38:49 +0000 Subject: [PATCH 29/38] [X86] avx2-builtins.c - add constexpr test coverage for _mm256_bslli_epi128/_mm256_bsrli_epi128 intrinsics (#169309) --- clang/test/CodeGen/X86/avx2-builtins.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index d22f2f8be8be3..13ad0545ab53f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -305,12 +305,16 @@ __m256i test_mm256_bslli_epi128(__m256i a) { // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> return _mm256_bslli_epi128(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_bslli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)); +TEST_CONSTEXPR(match_v32qi(_mm256_bslli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_bsrli_epi128(__m256i a) { // CHECK-LABEL: test_mm256_bsrli_epi128 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> return _mm256_bsrli_epi128(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_bsrli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 
28, 29, 30, 31, 32, 0, 0, 0)); +TEST_CONSTEXPR(match_v32qi(_mm256_bsrli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_cmpeq_epi8 From d44d329c0b2f13bd1c259c822f5c4fc47d1240d5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 24 Nov 2025 11:48:00 +0100 Subject: [PATCH 30/38] [clang][bytecode] Fix compound assign operators for IntAP(S) (#169303) We didn't take `IntAP`/`IntAPS` into account when casting to and from the computation LHS type. This broke the `std/ranges/range.factories/range.iota.view/end.pass.cpp` test. --- clang/lib/AST/ByteCode/Compiler.cpp | 28 ++++++++++++++------ clang/lib/AST/ByteCode/Compiler.h | 2 ++ clang/test/AST/ByteCode/intap.cpp | 40 +++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 958373ed94fe3..dd0b8e790d444 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2827,10 +2827,10 @@ bool Compiler::VisitCompoundAssignOperator( return false; if (!this->emitLoad(*LT, E)) return false; - if (LT != LHSComputationT) { - if (!this->emitCast(*LT, *LHSComputationT, E)) - return false; - } + if (LT != LHSComputationT && + !this->emitIntegralCast(*LT, *LHSComputationT, E->getComputationLHSType(), + E)) + return false; // Get the RHS value on the stack. if (!this->emitGetLocal(*RT, TempOffset, E)) @@ -2883,10 +2883,9 @@ bool Compiler::VisitCompoundAssignOperator( } // And now cast from LHSComputationT to ResultT. - if (ResultT != LHSComputationT) { - if (!this->emitCast(*LHSComputationT, *ResultT, E)) - return false; - } + if (ResultT != LHSComputationT && + !this->emitIntegralCast(*LHSComputationT, *ResultT, E->getType(), E)) + return false; // And store the result in LHS. 
if (DiscardResult) { @@ -7243,6 +7242,19 @@ bool Compiler::emitPrimCast(PrimType FromT, PrimType ToT, return false; } +template +bool Compiler::emitIntegralCast(PrimType FromT, PrimType ToT, + QualType ToQT, const Expr *E) { + assert(FromT != ToT); + + if (ToT == PT_IntAP) + return this->emitCastAP(FromT, Ctx.getBitWidth(ToQT), E); + if (ToT == PT_IntAPS) + return this->emitCastAPS(FromT, Ctx.getBitWidth(ToQT), E); + + return this->emitCast(FromT, ToT, E); +} + /// Emits __real(SubExpr) template bool Compiler::emitComplexReal(const Expr *SubExpr) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 6a9fe1e954530..54d39bbc25952 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -393,6 +393,8 @@ class Compiler : public ConstStmtVisitor, bool>, } bool emitPrimCast(PrimType FromT, PrimType ToT, QualType ToQT, const Expr *E); + bool emitIntegralCast(PrimType FromT, PrimType ToT, QualType ToQT, + const Expr *E); PrimType classifyComplexElementType(QualType T) const { assert(T->isAnyComplexType()); diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp index 05ab319bf16df..efb60cb0abffe 100644 --- a/clang/test/AST/ByteCode/intap.cpp +++ b/clang/test/AST/ByteCode/intap.cpp @@ -305,6 +305,46 @@ namespace UnderlyingInt128 { static_assert(foo() == 0, ""); // both-error {{not an integral constant expression}} \ // both-note {{in call to}} } + +namespace CompoundAssignOperators { + constexpr unsigned __int128 foo() { + long b = 10; + + b += (__int128)1; + b -= (__int128)1; + b *= (__int128)1; + b /= (__int128)1; + + b += (unsigned __int128)1; + b -= (unsigned __int128)1; + b *= (unsigned __int128)1; + b /= (unsigned __int128)1; + + __int128 i = 10; + i += (__int128)1; + i -= (__int128)1; + i *= (__int128)1; + i /= (__int128)1; + i += (unsigned __int128)1; + i -= (unsigned __int128)1; + i *= (unsigned __int128)1; + i /= (unsigned __int128)1; + + unsigned __int128 i2 = 10; + i2 += (__int128)1; + i2 -= (__int128)1; + i2 *= (__int128)1; + i2 /= (__int128)1; + i2 += (unsigned __int128)1; + i2 -= (unsigned __int128)1; + i2 *= (unsigned __int128)1; + i2 /= (unsigned __int128)1; + + return (int)b; + } + static_assert(foo() == 10); +} + #endif #endif From c73de9777e67df4411020a7909f0eadbbf1de08b Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Mon, 24 Nov 2025 11:12:06 +0000 Subject: [PATCH 31/38] [IVDesciptors] Support detecting reductions with vector instructions. (#166353) In combination with https://github.com/llvm/llvm-project/pull/149470 this will introduce parallel accumulators when unrolling reductions with vector instructions. See also https://github.com/llvm/llvm-project/pull/166630, which aims to introduce parallel accumulators for FP reductions. --- llvm/lib/Analysis/IVDescriptors.cpp | 6 +- .../LoopUnroll/partial-unroll-reductions.ll | 54 +++++++++++++++ .../LoopUnroll/runtime-unroll-reductions.ll | 67 +++++++++++++++++++ 3 files changed, 125 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 641850b46bbd8..26c7f805bdb6d 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -270,10 +270,12 @@ bool RecurrenceDescriptor::AddReductionVar( // resulting from the type promotion performed by InstCombine. Vector // operations are not limited to the legal integer widths, so we may be able // to evaluate the reduction in the narrower width. 
- if (RecurrenceType->isFloatingPointTy()) { + // Check the scalar type to handle both scalar and vector types. + Type *ScalarTy = RecurrenceType->getScalarType(); + if (ScalarTy->isFloatingPointTy()) { if (!isFloatingPointRecurrenceKind(Kind)) return false; - } else if (RecurrenceType->isIntegerTy()) { + } else if (ScalarTy->isIntegerTy()) { if (!isIntegerRecurrenceKind(Kind)) return false; if (!isMinMaxRecurrenceKind(Kind)) diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index 2d48d20ba9c5c..220a4a29a3041 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -358,6 +358,7 @@ loop: exit: ret float %rdx.next } + define i32 @test_smin(ptr %src, i64 %n) { ; CHECK-LABEL: define i32 @test_smin( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { @@ -623,3 +624,56 @@ loop: exit: ret i32 %rdx.next } + +define <4 x i32> @test_vector_add(ptr %p, i64 %n, <4 x i32> %start) { +; CHECK-LABEL: define <4 x i32> @test_vector_add( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x i32> [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load <4 x i32>, ptr [[GEP]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load <4 x i32>, ptr [[GEP_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load <4 x i32>, ptr [[GEP_2]], align 16 +; CHECK-NEXT: [[RDX_NEXT_2]] = add <4 x i32> [[RDX_2]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_3:%.*]] = load <4 x i32>, ptr [[GEP_3]], align 16 +; CHECK-NEXT: [[RDX_NEXT_3]] = add <4 x i32> [[RDX_3]], [[L_3]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi <4 x i32> [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[RDX_NEXT_3]], [[BIN_RDX1]] +; CHECK-NEXT: ret <4 x i32> [[BIN_RDX2]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ 
%start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep = getelementptr inbounds nuw <4 x i32>, ptr %p, i64 %iv + %l = load <4 x i32>, ptr %gep, align 16 + %rdx.next = add <4 x i32> %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret <4 x i32> %rdx.next +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll index a5ac2cf46653d..fb1f2fcf5c190 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -220,6 +220,72 @@ exit: ret i32 %res } +define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) { +; CHECK-LABEL: define <4 x i32> @test_vector_add_reduction( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP_A]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GEP_A_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]]) +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_EPIL_INIT]] +; CHECK-NEXT: 
[[TMP4:%.*]] = load <4 x i32>, ptr [[GEP_A_EPIL]], align 16 +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add <4 x i32> [[RDX_EPIL_INIT]], [[TMP4]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi <4 x i32> [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret <4 x i32> [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ zeroinitializer, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw <4 x i32>, ptr %a, i64 %iv + %1 = load <4 x i32>, ptr %gep.a, align 16 + %rdx.next = add <4 x i32> %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi <4 x i32> [ %rdx.next, %loop ] + ret <4 x i32> %res +} !0 = distinct !{!0, !1} @@ -234,4 +300,5 @@ exit: ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ;. From 840a43bbe3a7361f99e9444dfcfd9eefe60ba487 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 24 Nov 2025 11:25:17 +0000 Subject: [PATCH 32/38] [libcxx][ci] Temporarily disable ARM jobs (#169318) Linaro is doing network maintenance and I don't have an estimated time these will be back online. --- libcxx/utils/ci/buildkite-pipeline.yml | 115 +++++++++++++------------ 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index 2ac69c38ebffa..8b77a06323e3d 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -33,63 +33,64 @@ definitions: - "**/CMakeOutput.log" steps: -- group: ARM - steps: - - label: AArch64 - command: libcxx/utils/ci/run-buildbot aarch64 - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: AArch64 -fno-exceptions - command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: Armv8 - command: libcxx/utils/ci/run-buildbot armv8 - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv8 -fno-exceptions - command: libcxx/utils/ci/run-buildbot armv8-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7 - command: libcxx/utils/ci/run-buildbot armv7 - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7 -fno-exceptions - command: libcxx/utils/ci/run-buildbot armv7-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7-M picolibc - command: libcxx/utils/ci/run-buildbot armv7m-picolibc - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: Armv7-M picolibc -fno-exceptions - command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common +# Linaro's ARM builders are temporarily offline. 
+#- group: ARM +# steps: +# - label: AArch64 +# command: libcxx/utils/ci/run-buildbot aarch64 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: AArch64 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: Armv8 +# command: libcxx/utils/ci/run-buildbot armv8 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv8 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv8-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7 +# command: libcxx/utils/ci/run-buildbot armv7 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv7-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7-M picolibc +# command: libcxx/utils/ci/run-buildbot armv7m-picolibc +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: Armv7-M picolibc -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common - group: AIX steps: From fe9c8e4f10c74990a06f8837e4a45c56f725cb65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ingo=20M=C3=BCller?= Date: Mon, 24 Nov 2025 12:36:05 +0100 Subject: [PATCH 33/38] [mlir:x86vector:transform] Fix bazel build (again) after #168074. (#169316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a second attempt to fix the bazel build (after the first in #169294, which was accidentally merged before CI passed). In the first attempt, not all bazel dependencies had been added; this PR should add them all and make CI pass. Signed-off-by: Ingo Müller --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index b7fe0fa898c21..6d2eedbfe2415 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2610,8 +2610,13 @@ cc_library( hdrs = glob(["include/mlir/Dialect/X86Vector/TransformOps/*.h"]), includes = ["include"], deps = [ + ":IR", + ":LLVMCommonConversion", + ":LLVMDialect", ":TransformDialect", + ":TransformDialectInterfaces", ":VectorDialect", + ":X86VectorDialect", ":X86VectorTransformOpsIncGen", ":X86VectorTransforms", ], @@ -2628,6 +2633,9 @@ cc_library( ":LLVMCommonConversion", ":LLVMDialect", ":LinalgDialect", + ":LinalgInterfaces", + ":Pass", + ":TransformUtils", ":VectorDialect", ":VectorUtils", ":X86VectorDialect", From 5d2fc9408e99201a32f090ba263de05a362dfa2b Mon Sep 17 00:00:00 2001 From: Meredith Julian <35236176+mjulian31@users.noreply.github.com> Date: Mon, 24 Nov 2025 03:48:32 -0800 Subject: [PATCH 34/38] [InstCombine] Fix phi scalarization with binop (#169120) InstCombine phi scalarization would always create a new binary op with the phi as the first operand, which is not correct for non-commutable binary ops such as sub. This fix preserves the original binary op ordering in the new binary op and adds a test for this behavior. 
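A reduced sketch of the issue (hypothetical IR, simplified from the `scalarize_phi_sub` test added below):

```llvm
; Loop-carried negation: %x = phi <4 x float> [ %splat, %entry ], [ %sub, %body ]
%sub = fsub <4 x float> zeroinitializer, %x
; Correct scalarization preserves the operand order:
;   %sub.scalar = fsub float 0.000000e+00, %x.scalar
; The old code always placed the phi first, producing
;   %sub.scalar = fsub float %x.scalar, 0.000000e+00
; which drops the negation.
```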
Currently, this transformation can produce silently incorrect IR, and in the case of the added test, would optimize it out entirely. --- .../InstCombine/InstCombineVectorOps.cpp | 15 ++++--- .../Transforms/InstCombine/scalarization.ll | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 18a45c6799bac..98e2d9ebe4fc2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -140,8 +140,8 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, Value *Elt = EI.getIndexOperand(); // If the operand is the PHI induction variable: if (PHIInVal == PHIUser) { - // Scalarize the binary operation. Its first operand is the - // scalar PHI, and the second operand is extracted from the other + // Scalarize the binary operation. One operand is the + // scalar PHI, and the other is extracted from the other // vector operand. BinaryOperator *B0 = cast(PHIUser); unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; @@ -149,9 +149,14 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, ExtractElementInst::Create(B0->getOperand(opId), Elt, B0->getOperand(opId)->getName() + ".Elt"), B0->getIterator()); - Value *newPHIUser = InsertNewInstWith( - BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(), - scalarPHI, Op, B0), B0->getIterator()); + // Preserve operand order for binary operation to preserve semantics of + // non-commutative operations. + Value *FirstOp = (B0->getOperand(0) == PN) ? scalarPHI : Op; + Value *SecondOp = (B0->getOperand(0) == PN) ? Op : scalarPHI; + Value *newPHIUser = + InsertNewInstWith(BinaryOperator::CreateWithCopiedFlags( + B0->getOpcode(), FirstOp, SecondOp, B0), + B0->getIterator()); scalarPHI->addIncoming(newPHIUser, inBB); } else { // Scalarize PHI input: diff --git a/llvm/test/Transforms/InstCombine/scalarization.ll b/llvm/test/Transforms/InstCombine/scalarization.ll index a6931b4c41d2d..c4adf756f7756 100644 --- a/llvm/test/Transforms/InstCombine/scalarization.ll +++ b/llvm/test/Transforms/InstCombine/scalarization.ll @@ -108,6 +108,50 @@ for.end: ret void } +define void @scalarize_phi_sub(ptr %n, ptr %inout) { +; +; CHECK-LABEL: @scalarize_phi_sub( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T0:%.*]] = load volatile float, ptr [[INOUT:%.*]], align 4 +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[T0]], [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[T1:%.*]] = load i32, ptr [[N:%.*]], align 4 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[I_0]], [[T1]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.body: +; CHECK-NEXT: store volatile float [[TMP0]], ptr [[INOUT]], align 4 +; CHECK-NEXT: [[TMP1]] = fsub float 0.000000e+00, [[TMP0]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %t0 = load volatile float, ptr %inout, align 4 + %insert = insertelement <4 x float> poison, float %t0, i32 0 + %splat = shufflevector <4 x float> %insert, <4 x float> poison, <4 x i32> zeroinitializer + br label %for.cond + +for.cond: + %x.0 = phi <4 x float> [ %splat, %entry ], [ %sub, %for.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + 
+  %t1 = load i32, ptr %n, align 4
+  %cmp = icmp ne i32 %i.0, %t1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %t2 = extractelement <4 x float> %x.0, i32 1
+  store volatile float %t2, ptr %inout, align 4
+  %sub = fsub <4 x float> zeroinitializer, %x.0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
 define float @extract_element_binop_splat_constant_index(<4 x float> %x) {
 ;
 ; CHECK-LABEL: @extract_element_binop_splat_constant_index(

From d162c91c01a66e4af0af190044961e60db0eeb3d Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Mon, 24 Nov 2025 22:50:29 +1100
Subject: [PATCH 35/38] [ORC] Avoid self-dependence in SuperNode dependence graph. (#169286)

Avoid adding any given SuperNode SN to its own SuperNode-deps set. This saves us from trying to redundantly merge its dependencies back into itself (a no-op, but a potentially expensive one).
---
 llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
index f9995917363f9..9f14c8b2efd5f 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
@@ -500,7 +500,7 @@ template class WaitingOnGraph {
       if (I == SN->Deps.end())
        continue;
       for (auto &[DefElem, DefSN] : DefElems)
-        if (I->second.erase(DefElem))
+        if (I->second.erase(DefElem) && DefSN != SN.get())
           SNDeps.insert(DefSN);
       if (I->second.empty())
         SN->Deps.erase(I);
@@ -511,11 +511,13 @@ template class WaitingOnGraph {
   // Compute transitive closure of deps for each node.
   static void propagateSuperNodeDeps(SuperNodeDepsMap &SuperNodeDeps) {
     for (auto &[SN, Deps] : SuperNodeDeps) {
-      DenseSet<SuperNode *> Reachable({SN});
+      DenseSet<SuperNode *> Reachable;
       SmallVector<SuperNode *> Worklist(Deps.begin(), Deps.end());
 
       while (!Worklist.empty()) {
         auto *DepSN = Worklist.pop_back_val();
+        if (DepSN == SN)
+          continue;
         if (!Reachable.insert(DepSN).second)
           continue;
         auto I = SuperNodeDeps.find(DepSN);
@@ -537,9 +539,11 @@ template class WaitingOnGraph {
       if (I == SuperNodeDeps.end())
         continue;
 
-      for (auto *DepSN : I->second)
+      for (auto *DepSN : I->second) {
+        assert(DepSN != SN.get() && "Unexpected self-dependence for SN");
         for (auto &[Container, Elems] : DepSN->Deps)
           SN->Deps[Container].insert(Elems.begin(), Elems.end());
+      }
     }
   }

From 1dc6ad008164353e05bfe857f905028827834dbb Mon Sep 17 00:00:00 2001
From: Ebuka Ezike
Date: Mon, 24 Nov 2025 11:59:05 +0000
Subject: [PATCH 36/38] [lldb] Show signal number description (#164176)

Show information about the signal when the user runs `process handle <signal>`, e.g.:

```sh
(lldb) process handle SIGWINCH
NAME        PASS  STOP  NOTIFY DESCRIPTION
=========== ===== ===== ====== ===================
SIGWINCH    true  false false  window size changes
```

I wanted to reuse the existing `GetSignalDescription`, but its expected behaviour is to return the signal name when no signal code is passed; it is used in stop info.
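As a minimal sketch of the difference between the two accessors (not part of this patch; `DumpSignalStrings` is a hypothetical helper, and the includes assume an lldb source tree):

```cpp
#include "lldb/Target/UnixSignals.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

// Hypothetical helper, not part of this patch: contrasts the two accessors.
void DumpSignalStrings(lldb_private::UnixSignals &signals, int32_t signo) {
  // Existing API: with no si_code it falls back to the signal *name*
  // (e.g. "SIGWINCH"), which is the behaviour the stop-info path relies on.
  std::string name_or_desc = signals.GetSignalDescription(signo);
  // New API: just the plain description (e.g. "window size changes"),
  // or an empty StringRef for an unknown signal number.
  llvm::StringRef desc = signals.GetSignalNumberDescription(signo);
  llvm::outs() << name_or_desc << " | " << desc << "\n";
}
```

The stop-info call site that depends on the name fallback is at the permalink below.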
https://github.com/llvm/llvm-project/blob/65c895dfe084860847e9e220ff9f1b283ebcb289/lldb/source/Target/StopInfo.cpp#L1192-L1195
---
 lldb/include/lldb/Target/UnixSignals.h        |  2 ++
 lldb/source/Commands/CommandObjectProcess.cpp | 13 ++++++++++---
 lldb/source/Target/UnixSignals.cpp            |  7 +++++++
 lldb/unittests/Signals/UnixSignalsTest.cpp    | 12 ++++++++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/lldb/include/lldb/Target/UnixSignals.h b/lldb/include/lldb/Target/UnixSignals.h
index a1807d69f329b..590e4d1aa5208 100644
--- a/lldb/include/lldb/Target/UnixSignals.h
+++ b/lldb/include/lldb/Target/UnixSignals.h
@@ -31,6 +31,8 @@ class UnixSignals {
 
   llvm::StringRef GetSignalAsStringRef(int32_t signo) const;
 
+  llvm::StringRef GetSignalNumberDescription(int32_t signo) const;
+
   std::string
   GetSignalDescription(int32_t signo,
                        std::optional<int32_t> code = std::nullopt,
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 7d326404a5503..c17f12fae6e79 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -1603,8 +1603,8 @@ class CommandObjectProcessHandle : public CommandObjectParsed {
   Options *GetOptions() override { return &m_options; }
 
   void PrintSignalHeader(Stream &str) {
-    str.Printf("NAME        PASS  STOP  NOTIFY\n");
-    str.Printf("=========== ===== ===== ======\n");
+    str.Printf("NAME        PASS  STOP  NOTIFY DESCRIPTION\n");
+    str.Printf("=========== ===== ===== ====== ===================\n");
   }
 
   void PrintSignal(Stream &str, int32_t signo, llvm::StringRef sig_name,
@@ -1615,9 +1615,16 @@ class CommandObjectProcessHandle : public CommandObjectParsed {
     str.Format("{0, -11} ", sig_name);
     if (signals_sp->GetSignalInfo(signo, suppress, stop, notify)) {
-      bool pass = !suppress;
+      const bool pass = !suppress;
       str.Printf("%s %s %s", (pass ? "true " : "false"),
                  (stop ? "true " : "false"), (notify ? "true " : "false"));
+
+      const llvm::StringRef sig_description =
+          signals_sp->GetSignalNumberDescription(signo);
+      if (!sig_description.empty()) {
+        str.PutCString("  ");
+        str.PutCString(sig_description);
+      }
     }
     str.Printf("\n");
   }
diff --git a/lldb/source/Target/UnixSignals.cpp b/lldb/source/Target/UnixSignals.cpp
index 6113c6648817c..881431f4631e5 100644
--- a/lldb/source/Target/UnixSignals.cpp
+++ b/lldb/source/Target/UnixSignals.cpp
@@ -137,6 +137,13 @@ llvm::StringRef UnixSignals::GetSignalAsStringRef(int32_t signo) const {
   return pos->second.m_name;
 }
 
+llvm::StringRef UnixSignals::GetSignalNumberDescription(int32_t signo) const {
+  const auto pos = m_signals.find(signo);
+  if (pos == m_signals.end())
+    return {};
+  return pos->second.m_description;
+}
+
 std::string UnixSignals::GetSignalDescription(
     int32_t signo, std::optional<int32_t> code, std::optional<lldb::addr_t> addr,
    std::optional<lldb::addr_t> lower,
diff --git a/lldb/unittests/Signals/UnixSignalsTest.cpp b/lldb/unittests/Signals/UnixSignalsTest.cpp
index 582e441556067..3bd4aedd600a3 100644
--- a/lldb/unittests/Signals/UnixSignalsTest.cpp
+++ b/lldb/unittests/Signals/UnixSignalsTest.cpp
@@ -148,6 +148,18 @@ TEST(UnixSignalsTest, GetAsString) {
             signals.GetSignalDescription(16, 3, 0x1233, 0x1234, 0x5678));
 }
 
+TEST(UnixSignalsTest, GetNumberDescription) {
+  TestSignals signals;
+
+  ASSERT_EQ("DESC2", signals.GetSignalNumberDescription(2));
+  ASSERT_EQ("DESC4", signals.GetSignalNumberDescription(4));
+  ASSERT_EQ("DESC8", signals.GetSignalNumberDescription(8));
+  ASSERT_EQ("DESC16", signals.GetSignalNumberDescription(16));
+
+  // Unknown signal number.
+  ASSERT_EQ("", signals.GetSignalNumberDescription(100));
+}
+
 TEST(UnixSignalsTest, VersionChange) {
   TestSignals signals;
 

From d41628941743b778432e30d93f25028ffb375fbc Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko
Date: Mon, 24 Nov 2025 14:59:19 +0300
Subject: [PATCH 37/38] [TableGen] Eliminate the dependency on SDNode definition order (#168745)

Fix the dependency of `CodeGenDAGPatterns::ParseDefaultOperands()` on the particular order of SDNode definitions. Implicit use of the first definition as a placeholder makes `llvm-tblgen -gen-dag-isel` fail if that SDNode is not usable as an output pattern operator and an instance of `OperandWithDefaultOps` is used in a pattern.

Presently, each `OperandWithDefaultOps` record is processed by constructing an instance of TreePattern from its `DefaultOps` argument, which has the form `(ops ...)`. Even though the result of processing the root operator of that DAG is not inspected by the `ParseDefaultOperands()` function itself, that operator has to be supported by the underlying `TreePattern::ParseTreePattern()` function. For that reason, a temporary DAG is created by replacing the root operator of the `DefaultOps` argument with the first SDNode defined, which is usually the `def imm : ...` from the `TargetSelectionDAG.td` file.

This results in misleading errors being reported when implementing new `SDNode` types, if the new definition happens to be added before the `def imm : ...` line. The error is reported by several test cases executed by the `check-llvm` target, as well as by the regular build, if one of the enabled targets inherits one of its operand types from `OperandWithDefaultOps`:

    OptionalIntOperand:
    ../llvm/test/TableGen/DAGDefaultOps.td:28:5: error: In OptionalIntOperand: Cannot use 'unexpected_node' in an output pattern!
    def OptionalIntOperand: OperandWithDefaultOps;

This commit implements a dedicated constructor of `TreePattern` to be used when the caller does not care about the particular root operator of the pattern being processed.
---
 .../TableGen/Common/CodeGenDAGPatterns.cpp    | 33 +++++++++++++------
 .../TableGen/Common/CodeGenDAGPatterns.h      |  6 ++++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 2bea0ca2bfabb..8251c8983cc80 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -2922,6 +2922,14 @@ TreePattern::TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput,
   Trees.push_back(ParseTreePattern(Pat, ""));
 }
 
+TreePattern::TreePattern(const Record *TheRec, ArrayRef<const Init *> Args,
+                         ArrayRef<const StringInit *> ArgNames, bool isInput,
+                         CodeGenDAGPatterns &cdp)
+    : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
+      Infer(*this) {
+  Trees.push_back(ParseRootlessTreePattern(Args, ArgNames));
+}
+
 TreePattern::TreePattern(const Record *TheRec, TreePatternNodePtr Pat,
                          bool isInput, CodeGenDAGPatterns &cdp)
     : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
@@ -2950,6 +2958,19 @@ void TreePattern::ComputeNamedNodes(TreePatternNode &N) {
     ComputeNamedNodes(Child);
 }
 
+TreePatternNodePtr
+TreePattern::ParseRootlessTreePattern(ArrayRef<const Init *> Args,
+                                      ArrayRef<const StringInit *> ArgNames) {
+  std::vector<TreePatternNodePtr> Children;
+
+  for (auto [Arg, ArgName] : llvm::zip_equal(Args, ArgNames)) {
+    StringRef NameStr = ArgName ? ArgName->getValue() : "";
+    Children.push_back(ParseTreePattern(Arg, NameStr));
+  }
+
+  return makeIntrusiveRefCnt<TreePatternNode>(nullptr, std::move(Children), 1);
+}
+
 TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit,
                                                  StringRef OpName) {
   RecordKeeper &RK = TheInit->getRecordKeeper();
@@ -3487,20 +3508,12 @@ void CodeGenDAGPatterns::ParseDefaultOperands() {
   ArrayRef<const Record *> DefaultOps =
       Records.getAllDerivedDefinitions("OperandWithDefaultOps");
 
-  // Find some SDNode.
-  assert(!SDNodes.empty() && "No SDNodes parsed?");
-  const Init *SomeSDNode = SDNodes.begin()->first->getDefInit();
-
   for (unsigned i = 0, e = DefaultOps.size(); i != e; ++i) {
     const DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps");
 
-    // Clone the DefaultInfo dag node, changing the operator from 'ops' to
-    // SomeSDnode so that we can parse this.
-    const DagInit *DI = DagInit::get(SomeSDNode, DefaultInfo->getArgs(),
-                                     DefaultInfo->getArgNames());
-
     // Create a TreePattern to parse this.
-    TreePattern P(DefaultOps[i], DI, false, *this);
+    TreePattern P(DefaultOps[i], DefaultInfo->getArgs(),
+                  DefaultInfo->getArgNames(), false, *this);
     assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!");
 
     // Copy the operands over into a DAGDefaultOperand.
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 633327e2f74e5..9a67933013c1c 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -917,6 +917,9 @@ class TreePattern {
               CodeGenDAGPatterns &ise);
   TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput,
               CodeGenDAGPatterns &ise);
+  TreePattern(const Record *TheRec, ArrayRef<const Init *> Args,
+              ArrayRef<const StringInit *> ArgNames, bool isInput,
+              CodeGenDAGPatterns &ise);
   TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput,
               CodeGenDAGPatterns &ise);
 
@@ -981,6 +984,9 @@ class TreePattern {
 
 private:
   TreePatternNodePtr ParseTreePattern(const Init *DI, StringRef OpName);
+  TreePatternNodePtr
+  ParseRootlessTreePattern(ArrayRef<const Init *> Args,
+                           ArrayRef<const StringInit *> ArgNames);
   void ComputeNamedNodes();
   void ComputeNamedNodes(TreePatternNode &N);
 };

From d90bc3bc609d3ef2254e85cfcd435a99eb2b019b Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Mon, 24 Nov 2025 13:01:57 +0100
Subject: [PATCH 38/38] [mlir][XeGPU][VectorToXeGPU] Use 'xegpu.load' to lower 1D 'vector.transfer_read' for PVC & BMG (#168910)

The PR changes the `TransferReadLowering` to always use `xegpu.load` (and not `xegpu.load_nd`) for 1D cases, as it has a more developed interface (e.g. layout capabilities).

Signed-off-by: dchigarev
---
 .../VectorToXeGPU/VectorToXeGPU.cpp           |  7 +-
 .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 82 ++++++++++++++-----
 2 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index 1b4d1a42614ea..4358ef07da91d 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -519,8 +519,13 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
       return lowerToScatteredLoadOp(readOp, rewriter);
     }
 
-    // Perform common data transfer checks.
     VectorType vecTy = readOp.getVectorType();
+
+    // Lower using load.gather in 1D case
+    if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+      return lowerToScatteredLoadOp(readOp, rewriter);
+
+    // Perform common data transfer checks.
     if (failed(storeLoadPreconditions(rewriter, readOp, vecTy)))
       return failure();
 
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index c87a5304babfe..8bb272b1fe5fc 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -11,14 +11,15 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector
 
 // LOAD-ND-LABEL: @load_1D_vector(
 // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
-// LOAD-ND-SAME: %[[OFFSET:.+]]: index
-// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0]
-// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME: %[[COLLAPSED]]
-// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32,
-// LOAD-ND-SAME: boundary_check = false
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32>
-// LOAD-ND: return %[[VEC]]
+// LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
+// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND-COUNT2: arith.muli {{.*}} : index
+// LOAD-ND-COUNT2: arith.addi {{.*}} : index
+// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index
+// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
 
 // LOAD-GATHER-LABEL: @load_1D_vector(
 // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
@@ -404,7 +405,7 @@ gpu.func @no_load_unsupported_map(%source: memref<16x32x64xf32>,
 
 // -----
 gpu.module @xevm_module {
-gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
+gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
   %c0 = arith.constant 0.0 : f16
   %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
   %0 = vector.transfer_read %subview[%off2, %off2], %c0
    {in_bounds = [true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8xf16>
  gpu.return %0 : vector<8xf16>
 }
 
-// LOAD-ND-LABEL: @load_from_subview(
+// LOAD-ND-LABEL: @load_from_subview_1D(
 // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
 // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
-// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0]
-// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME: %[[COLLAPSED]]
-// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
-// LOAD-ND-SAME: boundary_check = false
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16>
-// LOAD-ND: return %[[VEC]]
-
-// LOAD-GATHER-LABEL: @load_from_subview(
+// LOAD-ND: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND: arith.muli {{.*}} : index
+// LOAD-ND: arith.addi %[[OFFSET]]{{.*}} : index
+// LOAD-ND: arith.addi {{.*}} : index
+// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
+
+// LOAD-GATHER-LABEL: @load_from_subview_1D(
 // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
 // LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
 // LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
@@ -440,3 +445,42 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
 // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
 // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
 }
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8x16xf16> {
+  %c0 = arith.constant 0.0 : f16
+  %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+  %0 = vector.transfer_read %subview[%off2, %off2], %c0
+    {in_bounds = [true, true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8x16xf16>
+  gpu.return %0 : vector<8x16xf16>
+}
+
+// LOAD-ND-LABEL: @load_from_subview_2D(
+// LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
+// LOAD-ND-SAME: %[[SUBVIEW]]
+// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf16,
+// LOAD-ND-SAME: boundary_check = false
+// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16>
+// LOAD-ND: return %[[VEC]]
+
+// LOAD-GATHER-LABEL: @load_from_subview_2D(
+// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1>
+// LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-GATHER-COUNT2: vector.step
+// LOAD-GATHER-COUNT2: vector.shape_cast
+// LOAD-GATHER-COUNT2: vector.broadcast
+// LOAD-GATHER-COUNT2: arith.muli {{.*}} : index
+// LOAD-GATHER-COUNT2: arith.addi {{.*}} : index
+// LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
+// LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
+// LOAD-GATHER: 
%[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index +// LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 +// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16> +}
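
The net effect of the `TransferReadLowering` change above can be condensed into the following sketch. It paraphrases the dispatch order of `matchAndRewrite` with helper signatures approximated from their call sites in the diff; it is not the exact pattern code.

```cpp
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

// Helpers named in the diff above; signatures approximated for this sketch.
LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp,
                                     PatternRewriter &rewriter);
LogicalResult storeLoadPreconditions(PatternRewriter &rewriter,
                                     vector::TransferReadOp readOp,
                                     VectorType vecTy);

// Simplified paraphrase of TransferReadLowering::matchAndRewrite after the
// patch: in-bounds 1-D reads always take the gather path and so lower to
// xegpu.load; earlier scattered cases and the n-D load_nd path are elided.
static LogicalResult lowerTransferRead(vector::TransferReadOp readOp,
                                       PatternRewriter &rewriter) {
  VectorType vecTy = readOp.getVectorType();
  if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
    return lowerToScatteredLoadOp(readOp, rewriter);
  if (failed(storeLoadPreconditions(rewriter, readOp, vecTy)))
    return failure();
  // ... remaining n-D cases lower via xegpu.create_nd_tdesc + xegpu.load_nd.
  return failure();
}
```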