From 996639d6ebb86ff15a8c99b67f1c2e2117636ae7 Mon Sep 17 00:00:00 2001 From: Veera <32646674+veera-sivarajan@users.noreply.github.com> Date: Sun, 9 Nov 2025 21:08:22 -0800 Subject: [PATCH 01/24] [MLIR][BufferResultsToOutParamsPass] Add Option to Modify Public Function's Signature (#167248) Since https://github.com/llvm/llvm-project/pull/162441, `buffer-results-to-out-params` transforms `private` functions only. But, as mentioned in https://github.com/llvm/llvm-project/pull/162441#issuecomment-3404195242, this is a breaking change for pipelines handling C code. Our pipeline @EfficientComputer is also affected by this breaking change. Therefore, this PR adds an opt-in flag to allow `public` functions to be transformed by `BufferResultsToOutParamsPass`. --- .../Dialect/Bufferization/Transforms/Passes.h | 3 ++ .../Bufferization/Transforms/Passes.td | 3 ++ .../Transforms/BufferResultsToOutParams.cpp | 10 ++++- ...to-out-params-modify-public-functions.mlir | 40 +++++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Transforms/buffer-results-to-out-params-modify-public-functions.mlir diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h index 67ac487d8226d..ea158914e445b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.h @@ -171,6 +171,9 @@ struct BufferResultsToOutParamsOpts { /// If true, the pass eliminates the memref.alloc and memcpy if the returned /// memref is allocated in the current function and has dynamic shape. bool hoistDynamicAllocs = false; + + /// If true, the pass modifies the function signatures of public functions. + bool modifyPublicFunctions = false; }; /// Replace buffers that are returned from a function with an out parameter. diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index cad44cb15f479..1eb692586bcfc 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -258,6 +258,9 @@ def BufferResultsToOutParamsPass /*default=*/"false", "Hoist static allocations to call sites.">, Option<"hoistDynamicAllocs", "hoist-dynamic-allocs", "bool", /*default=*/"false", "Hoist dynamic allocations to call sites.">, + Option<"modifyPublicFunctions", "modify-public-functions", "bool", + /*default=*/"false", "Modify function signatures of public " + "functions.">, ]; let dependentDialects = ["memref::MemRefDialect"]; } diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp index b9ee0a4d401f3..d0742ec27ed60 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferResultsToOutParams.cpp @@ -217,7 +217,9 @@ updateCalls(ModuleOp module, const AllocDynamicSizesMap &map, } if (!options.filterFn(&callee)) return; - if (callee.isExternal() || callee.isPublic()) + if (callee.isPublic() && !options.modifyPublicFunctions) + return; + if (callee.isExternal()) return; SmallVector replaceWithNewCallResults; @@ -295,7 +297,9 @@ LogicalResult mlir::bufferization::promoteBufferResultsToOutParams( // function. 
AllocDynamicSizesMap map; for (auto func : module.getOps()) { - if (func.isExternal() || func.isPublic()) + if (func.isPublic() && !options.modifyPublicFunctions) + continue; + if (func.isExternal()) continue; if (!options.filterFn(&func)) continue; @@ -326,6 +330,8 @@ struct BufferResultsToOutParamsPass options.hoistStaticAllocs = true; if (hoistDynamicAllocs) options.hoistDynamicAllocs = true; + if (modifyPublicFunctions) + options.modifyPublicFunctions = true; if (failed(bufferization::promoteBufferResultsToOutParams(getOperation(), options))) diff --git a/mlir/test/Transforms/buffer-results-to-out-params-modify-public-functions.mlir b/mlir/test/Transforms/buffer-results-to-out-params-modify-public-functions.mlir new file mode 100644 index 0000000000000..c99bde3f34986 --- /dev/null +++ b/mlir/test/Transforms/buffer-results-to-out-params-modify-public-functions.mlir @@ -0,0 +1,40 @@ +// RUN: mlir-opt -p 'builtin.module(buffer-results-to-out-params{modify-public-functions})' %s | FileCheck %s + +// Test if `public` functions' return values are transformed into out parameters +// when `buffer-results-to-out-params` is invoked with `modifyPublicFunctions`. + +// CHECK-LABEL: func.func @basic( +// CHECK-SAME: %[[ARG0:.*]]: memref) { +// CHECK: %[[VAL_0:.*]] = "test.source"() : () -> memref +// CHECK: memref.copy %[[VAL_0]], %[[ARG0]] : memref to memref +// CHECK: return +// CHECK: } +func.func @basic() -> (memref) { + %0 = "test.source"() : () -> (memref) + return %0 : memref +} + +// CHECK-LABEL: func.func @presence_of_existing_arguments( +// CHECK-SAME: %[[ARG0:.*]]: memref<1xf32>, +// CHECK-SAME: %[[ARG1:.*]]: memref<2xf32>) { +// CHECK: %[[VAL_0:.*]] = "test.source"() : () -> memref<2xf32> +// CHECK: memref.copy %[[VAL_0]], %[[ARG1]] : memref<2xf32> to memref<2xf32> +// CHECK: return +// CHECK: } +func.func @presence_of_existing_arguments(%arg0: memref<1xf32>) -> (memref<2xf32>) { + %0 = "test.source"() : () -> (memref<2xf32>) + return %0 : memref<2xf32> +} + +// CHECK-LABEL: func.func @multiple_results( +// CHECK-SAME: %[[ARG0:.*]]: memref<1xf32>, +// CHECK-SAME: %[[ARG1:.*]]: memref<2xf32>) { +// CHECK: %[[VAL_0:.*]]:2 = "test.source"() : () -> (memref<1xf32>, memref<2xf32>) +// CHECK: memref.copy %[[VAL_0]]#0, %[[ARG0]] : memref<1xf32> to memref<1xf32> +// CHECK: memref.copy %[[VAL_0]]#1, %[[ARG1]] : memref<2xf32> to memref<2xf32> +// CHECK: return +// CHECK: } +func.func @multiple_results() -> (memref<1xf32>, memref<2xf32>) { + %0, %1 = "test.source"() : () -> (memref<1xf32>, memref<2xf32>) + return %0, %1 : memref<1xf32>, memref<2xf32> +} From a8a0ffba739d247e24faaf612ac8f2d8faf1de3c Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 10 Nov 2025 08:17:08 +0100 Subject: [PATCH 02/24] [clang][bytecode] Check source pointer for bitcast validity (#166907) Unfortunately this is more dynamic than anticipated. 
Fixes https://github.com/llvm/llvm-project/issues/165006 --- clang/lib/AST/ByteCode/Compiler.cpp | 22 ++++---------- clang/lib/AST/ByteCode/Compiler.h | 2 -- clang/lib/AST/ByteCode/Interp.h | 46 +++++++++++++++++++++++++++++ clang/lib/AST/ByteCode/Opcodes.td | 2 +- clang/test/AST/ByteCode/cxx11.cpp | 11 +++++++ clang/test/AST/ByteCode/invalid.cpp | 18 +++++++++++ 6 files changed, 81 insertions(+), 20 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 20836f663cdf8..f68422c6eb01d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -208,19 +208,6 @@ template class LocOverrideScope final { } // namespace interp } // namespace clang -template -bool Compiler::isValidBitCast(const CastExpr *E) { - QualType FromTy = E->getSubExpr()->getType()->getPointeeType(); - QualType ToTy = E->getType()->getPointeeType(); - - if (classify(FromTy) == classify(ToTy)) - return true; - - if (FromTy->isVoidType() || ToTy->isVoidType()) - return true; - return false; -} - template bool Compiler::VisitCastExpr(const CastExpr *CE) { const Expr *SubExpr = CE->getSubExpr(); @@ -506,12 +493,9 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (!FromT || !ToT) return false; - if (!this->isValidBitCast(CE) && - !this->emitInvalidCast(CastKind::ReinterpretLike, /*Fatal=*/false, CE)) - return false; - assert(isPtrType(*FromT)); assert(isPtrType(*ToT)); + bool SrcIsVoidPtr = SubExprTy->isVoidPointerType(); if (FromT == ToT) { if (CE->getType()->isVoidPointerType() && !SubExprTy->isFunctionPointerType()) { @@ -520,6 +504,10 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (!this->visit(SubExpr)) return false; + if (!this->emitCheckBitCast(CETy->getPointeeType().getTypePtr(), + SrcIsVoidPtr, CE)) + return false; + if (CE->getType()->isFunctionPointerType() || SubExprTy->isFunctionPointerType()) { return this->emitFnPtrCast(CE); diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index fac0a7f4e1886..5c46f75af4da3 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -425,8 +425,6 @@ class Compiler : public ConstStmtVisitor, bool>, bool refersToUnion(const Expr *E); - bool isValidBitCast(const CastExpr *E); - protected: /// Variable to storage mapping. 
llvm::DenseMap Locals; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index c16408cef1fde..cbd60c9f2b37c 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -3290,6 +3290,52 @@ inline bool SideEffect(InterpState &S, CodePtr OpPC) { return S.noteSideEffect(); } +inline bool CheckBitCast(InterpState &S, CodePtr OpPC, const Type *TargetType, + bool SrcIsVoidPtr) { + const auto &Ptr = S.Stk.peek(); + if (Ptr.isZero()) + return true; + if (!Ptr.isBlockPointer()) + return true; + + if (TargetType->isIntegerType()) + return true; + + if (SrcIsVoidPtr && S.getLangOpts().CPlusPlus) { + bool HasValidResult = !Ptr.isZero(); + + if (HasValidResult) { + if (S.getStdAllocatorCaller("allocate")) + return true; + + const auto &E = cast(S.Current->getExpr(OpPC)); + if (S.getLangOpts().CPlusPlus26 && + S.getASTContext().hasSimilarType(Ptr.getType(), + QualType(TargetType, 0))) + return true; + + S.CCEDiag(E, diag::note_constexpr_invalid_void_star_cast) + << E->getSubExpr()->getType() << S.getLangOpts().CPlusPlus26 + << Ptr.getType().getCanonicalType() << E->getType()->getPointeeType(); + } else if (!S.getLangOpts().CPlusPlus26) { + const SourceInfo &E = S.Current->getSource(OpPC); + S.CCEDiag(E, diag::note_constexpr_invalid_cast) + << diag::ConstexprInvalidCastKind::CastFrom << "'void *'" + << S.Current->getRange(OpPC); + } + } + + QualType PtrType = Ptr.getType(); + if (PtrType->isRecordType() && + PtrType->getAsRecordDecl() != TargetType->getAsRecordDecl()) { + S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_invalid_cast) + << diag::ConstexprInvalidCastKind::ThisConversionOrReinterpret + << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); + return false; + } + return true; +} + /// Same here, but only for casts. inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind, bool Fatal) { diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index a2eaa61ea4306..1785fcf4a7b20 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -422,8 +422,8 @@ def CheckLiteralType : Opcode { } def CheckArraySize : Opcode { let Args = [ArgUint64]; } - def CheckFunctionDecl : Opcode { let Args = [ArgFunctionDecl]; } +def CheckBitCast : Opcode { let Args = [ArgTypePtr, ArgBool]; } // [] -> [Value] def GetGlobal : AccessOpcode; diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 753e51dfbfc1c..95615350f5142 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -387,3 +387,14 @@ struct Counter { // Passing an lvalue by value makes a non-elidable copy. constexpr int PassByValue(Counter c) { return c.copies; } static_assert(PassByValue(Counter(0)) == 0, "expect no copies"); + +namespace PointerCast { + /// The two interpreters disagree here. 
+ struct S { int x, y; } s; + constexpr S* sptr = &s; + struct U {}; + struct Str { + int e : (Str*)(sptr) == (Str*)(sptr); // expected-error {{not an integral constant expression}} \ + // expected-note {{cast that performs the conversions of a reinterpret_cast}} + }; +} diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp index 1f2d6bc1d48eb..115c8663079a1 100644 --- a/clang/test/AST/ByteCode/invalid.cpp +++ b/clang/test/AST/ByteCode/invalid.cpp @@ -88,4 +88,22 @@ namespace InvalidBitCast { // both-note {{in call to}} + struct sockaddr + { + char sa_data[8]; + }; + struct in_addr + { + unsigned int s_addr; + }; + struct sockaddr_in + { + unsigned short int sin_port; + struct in_addr sin_addr; + }; + /// Bitcast from sockaddr to sockaddr_in. Used to crash. + unsigned int get_addr(sockaddr addr) { + return ((sockaddr_in *)&addr)->sin_addr.s_addr; + } + } From d10a85167a26e9489f9daf20acc0092d55687b15 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Mon, 10 Nov 2025 08:07:16 +0000 Subject: [PATCH 03/24] [WebAssembly] Implement more of getCastInstrCost (#164612) Fill out more information for sign and zero extend and add some truncate information; however, the primary change is to int/fp conversions. In particular, fp to (narrow) int appears to be relatively expensive. --- .../WebAssemblyTargetTransformInfo.cpp | 66 ++++- .../CodeGen/WebAssembly/memory-interleave.ll | 278 +----------------- .../WebAssembly/memory-interleave.ll | 82 +++--- 3 files changed, 112 insertions(+), 314 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 92a9812df2127..70f7b889551a4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -119,18 +119,82 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost( } } - // extend_low static constexpr TypeConversionCostTblEntry ConversionTbl[] = { + // extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1}, {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1}, + // 2 x extend_low {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2}, + // extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, + {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, + {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, + // 2x extend_low, extend_high + {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4}, + {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4}, + // shuffle + {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2}, + {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4}, + {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2}, + {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4}, + // narrow, and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2}, + {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2}, + // narrow, 2x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3}, + // 3x narrow, 4x and + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7}, + {ISD::TRUNCATE, MVT::v16i8, 
MVT::v16i32, 7}, + // 7x narrow, 8x and + {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15}, + // convert_i32x4 + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, + // extend_low, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, + // extend_low x 2, convert + {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, + {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, + // several shuffles + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, + {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, + /// trunc_sat, const, and, 3x narrow + {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6}, + {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6}, + {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6}, + /// trunc_sat, const, and, narrow + {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4}, + {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4}, + {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4}, + // 2x trunc_sat, const, 2x and, 3x narrow + {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8}, + {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8}, + // 2x trunc_sat, const, 2x and, narrow + {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6}, + {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6}, }; if (const auto *Entry = diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 404db23ba7329..5d58ae223da6f 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -1720,28 +1720,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: two_floats_two_bytes_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: v128.store64_lane +; CHECK-NOT: v128.load define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1774,28 +1753,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: two_floats_two_bytes_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: 
f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: v128.store64_lane +; CHECK-NOT: v128.load define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -2347,64 +2305,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_bytes_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and 
-; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2453,64 +2354,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_bytes_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_bytes_vary_op(ptr noundef 
readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 @@ -2757,62 +2601,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 65535, 65535, 65535, 65535 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -; CHECK: v128.store -; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2861,62 +2650,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: 
v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 65535, 65535, 65535, 65535 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -; CHECK: v128.store -; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index b26e9cf55ddbf..718e03cfa0c67 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -1231,7 +1231,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no ; CHECK: LV: Found an estimated cost of 
26 for VF 8 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48 -; CHECK: LV: Vector loop of width 8 costs: 10. +; CHECK: LV: Vector loop of width 8 costs: 11. ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %14 = load i8 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %20 = load i8 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %48 @@ -1442,8 +1442,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1484,8 +1484,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 23 -; CHECK: LV: Vector loop of width 4 costs: 13 +; CHECK: LV: Vector loop of width 2 costs: 27 +; CHECK: LV: Vector loop of width 4 costs: 15 ; CHECK: LV: Selecting VF: 4. define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1526,9 +1526,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1566,9 +1566,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 21 -; CHECK: LV: Vector loop of width 4 costs: 14. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 26 +; CHECK: LV: Vector loop of width 4 costs: 16. +; CHECK: LV: Selecting VF: 1. define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1608,8 +1608,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. 
+; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1652,8 +1652,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 18 -; CHECK: LV: Vector loop of width 2 costs: 22 -; CHECK: LV: Vector loop of width 4 costs: 11. +; CHECK: LV: Vector loop of width 2 costs: 24 +; CHECK: LV: Vector loop of width 4 costs: 12 ; CHECK: LV: Selecting VF: 4. define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1696,9 +1696,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. +; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1738,9 +1738,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2 ; CHECK: LV: Scalar loop costs: 16 -; CHECK: LV: Vector loop of width 2 costs: 20 -; CHECK: LV: Vector loop of width 4 costs: 13. -; CHECK: LV: Selecting VF: 4. 
+; CHECK: LV: Vector loop of width 2 costs: 23 +; CHECK: LV: Vector loop of width 4 costs: 14 +; CHECK: LV: Selecting VF: 4 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -1883,8 +1883,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -1943,8 +1943,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 43 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 51 +; CHECK: LV: Vector loop of width 4 costs: 27 ; CHECK: LV: Selecting VF: 4 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2004,9 +2004,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2061,9 +2061,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 38 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 48 +; CHECK: LV: Vector loop of width 4 costs: 31 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 @@ -2119,8 +2119,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef 
readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2181,8 +2181,8 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 32 -; CHECK: LV: Vector loop of width 2 costs: 37 -; CHECK: LV: Vector loop of width 4 costs: 23 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 25 ; CHECK: LV: Selecting VF: 4 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: @@ -2243,9 +2243,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2301,9 +2301,9 @@ for.body: ; preds = %entry, %for.body ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 28 -; CHECK: LV: Vector loop of width 2 costs: 35 -; CHECK: LV: Vector loop of width 4 costs: 26 -; CHECK: LV: Selecting VF: 4 +; CHECK: LV: Vector loop of width 2 costs: 41 +; CHECK: LV: Vector loop of width 4 costs: 29 +; CHECK: LV: Selecting VF: 1 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 From f6138015ef17b0c522a1b60299659ef1587bb8d0 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Mon, 10 Nov 2025 16:09:57 +0800 Subject: [PATCH 04/24] [RISCV][llvm] Support Smpmpmt version 0.6 (#166322) spec: https://github.com/riscv/riscv-isa-manual/blob/smpmpmt/src/smpmpmt.adoc Co-Authored-by: Jesse Huang --- clang/test/Driver/print-supported-extensions-riscv.c | 1 + clang/test/Preprocessor/riscv-target-features.c | 9 +++++++++ llvm/docs/RISCVUsage.rst | 3 +++ llvm/lib/Target/RISCV/RISCVFeatures.td | 3 +++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++++ llvm/test/CodeGen/RISCV/features-info.ll | 1 + llvm/test/MC/RISCV/attribute-arch.s | 3 +++ llvm/unittests/TargetParser/RISCVISAInfoTest.cpp | 1 + 8 files changed, 25 insertions(+) diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index cb812736786a9..681c912bd1612 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -227,6 +227,7 @@ // CHECK-NEXT: zvfofp8min 0.2 'Zvfofp8min' (Vector OFP8 Converts) // CHECK-NEXT: zvkgs 0.7 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography) // CHECK-NEXT: zvqdotq 0.0 'Zvqdotq' (Vector quad widening 4D Dot Product) +// CHECK-NEXT: smpmpmt 0.6 'Smpmpmt' (PMP-based Memory Types Extension) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to 
Supervisor Addresses) // CHECK-NEXT: xqccmp 0.3 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves) // CHECK-NEXT: xqcia 0.7 'Xqcia' (Qualcomm uC Arithmetic Extension) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 77731a9776be8..56c738bc007fb 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -40,6 +40,7 @@ // CHECK-NOT: __riscv_smepmp {{.*$}} // CHECK-NOT: __riscv_smmpm{{.*$}} // CHECK-NOT: __riscv_smnpm{{.*$}} +// CHECK-NOT: __riscv_smpmpmt {{.*$}} // CHECK-NOT: __riscv_smrnmi {{.*$}} // CHECK-NOT: __riscv_smstateen {{.*$}} // CHECK-NOT: __riscv_ssaia {{.*$}} @@ -1333,6 +1334,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SMEPMP-EXT %s // CHECK-SMEPMP-EXT: __riscv_smepmp 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32ismpmpmt0p6 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64ismpmpmt0p6 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMPMPMT %s +// CHECK-SMPMPMT: __riscv_smpmpmt 6000{{$}} + // RUN: %clang --target=riscv32 \ // RUN: -march=rv32ismrnmi1p0 -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMRNMI-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index d03f383a92b3b..a21f03d389444 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -351,6 +351,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvqdotq`` LLVM implements the `0.0.1 draft specification `__. +``experimental-smpmpmt`` + LLVM implements the `0.6 draft specification `__. + To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. 
Vendor Extensions diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 5b72334f58d45..0b964c4808d8a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -956,6 +956,9 @@ def FeatureStdExtSsdbltrp def FeatureStdExtSmepmp : RISCVExtension<1, 0, "Enhanced Physical Memory Protection">; +def FeatureStdExtSmpmpmt + : RISCVExperimentalExtension<0, 6, "PMP-based Memory Types Extension">; + def FeatureStdExtSmrnmi : RISCVExtension<1, 0, "Resumable Non-Maskable Interrupts">; def HasStdExtSmrnmi : Predicate<"Subtarget->hasStdExtSmrnmi()">, diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 22c2d8102b5ca..f26d4f09c92fb 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -125,6 +125,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s ; RUN: llc -mtriple=riscv32 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCNTRPMF %s ; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV32SMPMPMT %s ; RUN: llc -mtriple=riscv32 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV32SMRNMI %s ; RUN: llc -mtriple=riscv32 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFA %s @@ -275,6 +276,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s ; RUN: llc -mtriple=riscv64 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCNTRPMF %s ; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV64SMPMPMT %s ; RUN: llc -mtriple=riscv64 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV64SMRNMI %s ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFA %s @@ -439,6 +441,7 @@ ; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0" ; RV32SMCNTRPMF: .attribute 5, "rv32i2p1_smcntrpmf1p0" ; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" +; RV32SMPMPMT: .attribute 5, "rv32i2p1_smpmpmt0p6" ; RV32SMRNMI: .attribute 5, "rv32i2p1_smrnmi1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV32ZVFBFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" @@ -587,6 +590,7 @@ ; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0" ; RV64SMCNTRPMF: .attribute 5, "rv64i2p1_smcntrpmf1p0" ; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" +; RV64SMPMPMT: .attribute 5, "rv64i2p1_smpmpmt0p6" ; RV64SMRNMI: .attribute 5, "rv64i2p1_smrnmi1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV64ZVFBFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index cf44af608542c..3d9906fdcbeb3 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: experimental - 
Experimental intrinsics. ; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)). ; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK-NEXT: experimental-smpmpmt - 'Smpmpmt' (PMP-based Memory Types Extension). ; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). ; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves). ; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension). diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 111616df254d3..e41c9eac982a7 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -348,6 +348,9 @@ .attribute arch, "rv32i_smepmp1p0" # CHECK: attribute 5, "rv32i2p1_smepmp1p0" +.attribute arch, "rv32i_smpmpmt0p6" +# CHECK: attribute 5, "rv32i2p1_smpmpmt0p6" + .attribute arch, "rv32i_smrnmi1p0" # CHECK: attribute 5, "rv32i2p1_smrnmi1p0" diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index bfc127530570d..c55cd94048cc5 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1204,6 +1204,7 @@ Experimental extensions zvfofp8min 0.2 zvkgs 0.7 zvqdotq 0.0 + smpmpmt 0.6 svukte 0.3 xqccmp 0.3 xqcia 0.7 From 6408703de5a523e331ee47bbb6bea5a13b1a2758 Mon Sep 17 00:00:00 2001 From: Karthika Devi C Date: Mon, 10 Nov 2025 13:48:15 +0530 Subject: [PATCH 05/24] [Polly] Retain vectorization for fallback loop when RTC is unsatisfiable (#165525) When Polly generates a false runtime condition (RTC), the associated Polly generated loop is never executed and is eventually eliminated. As a result, the fallback loop becomes the default execution path. Disabling vectorization for this fallback loop will be counterproductive. This patch ensures that vectorization is only disabled when the RTC is not false (no Codegen failure). --- polly/lib/CodeGen/CodeGeneration.cpp | 24 ++++++++++------ .../CodeGen/Metadata/fallback_vec_annotate.ll | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 polly/test/CodeGen/Metadata/fallback_vec_annotate.ll diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp index 2d8b393cc039c..062cdfbcfe3b5 100644 --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -235,15 +235,6 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI, NodeBuilder.allocateNewArrays(StartExitBlocks); Annotator.buildAliasScopes(S); - // The code below annotates the "llvm.loop.vectorize.enable" to false - // for the code flow taken when RTCs fail. Because we don't want the - // Loop Vectorizer to come in later and vectorize the original fall back - // loop when Polly is enabled. - for (Loop *L : LI.getLoopsInPreorder()) { - if (S.contains(L)) - addStringMetadataToLoop(L, "llvm.loop.vectorize.enable", 0); - } - if (PerfMonitoring) { PerfMonitor P(S, EnteringBB->getParent()->getParent()); P.initialize(); @@ -285,6 +276,21 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI, Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC); + auto *CI = dyn_cast(RTC); + // The code below annotates the "llvm.loop.vectorize.enable" to false + // for the code flow taken when RTCs fail. 
Because we don't want the + // Loop Vectorizer to come in later and vectorize the original fall back + // loop when Polly is enabled. This avoids loop versioning on fallback + // loop by Loop Vectorizer. Don't do this when Polly's RTC value is + // false (due to code generation failure), as we are left with only one + // version of Loop. + if (!(CI && CI->isZero())) { + for (Loop *L : LI.getLoopsInPreorder()) { + if (S.contains(L)) + addStringMetadataToLoop(L, "llvm.loop.vectorize.enable", 0); + } + } + // Explicitly set the insert point to the end of the block to avoid that a // split at the builder's current // insert position would move the malloc calls to the wrong BasicBlock. diff --git a/polly/test/CodeGen/Metadata/fallback_vec_annotate.ll b/polly/test/CodeGen/Metadata/fallback_vec_annotate.ll new file mode 100644 index 0000000000000..317d30649ab1d --- /dev/null +++ b/polly/test/CodeGen/Metadata/fallback_vec_annotate.ll @@ -0,0 +1,28 @@ +; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-annotate-metadata-vectorize < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s + +; Verify vectorization is not disabled when RTC of Polly is false + +; CHECK: attributes {{.*}} = { "polly-optimized" } +; CHECK-NOT: {{.*}} = !{!"llvm.loop.vectorize.enable", i32 0} + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-android10000" + +define void @ham(i64 %arg) { +bb: + br label %bb1 + +bb1: ; preds = %bb3, %bb + %phi = phi ptr [ %getelementptr4, %bb3 ], [ null, %bb ] + br label %bb2 + +bb2: ; preds = %bb2, %bb1 + %getelementptr = getelementptr i8, ptr %phi, i64 1 + store i8 0, ptr %getelementptr, align 1 + br i1 false, label %bb2, label %bb3 + +bb3: ; preds = %bb2 + %getelementptr4 = getelementptr i8, ptr %phi, i64 %arg + br label %bb1 +} From 4b433cbdeec7870fac318222d280dd0294ee34e3 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Mon, 10 Nov 2025 11:26:38 +0300 Subject: [PATCH 06/24] [clang-tidy] Rename 'cert-err60-cpp' to 'bugprone-exception-copy-constructor-throws' (#164061) Closes https://github.com/llvm/llvm-project/issues/157299. 
--------- Co-authored-by: Victor Chernyakin --- .../bugprone/BugproneTidyModule.cpp | 3 ++ .../clang-tidy/bugprone/CMakeLists.txt | 1 + .../ExceptionCopyConstructorThrowsCheck.cpp} | 12 ++++--- .../ExceptionCopyConstructorThrowsCheck.h} | 16 +++++----- .../clang-tidy/cert/CERTTidyModule.cpp | 5 +-- .../clang-tidy/cert/CMakeLists.txt | 1 - clang-tools-extra/docs/ReleaseNotes.rst | 4 +++ .../exception-copy-constructor-throws.rst | 31 +++++++++++++++++++ .../docs/clang-tidy/checks/cert/err60-cpp.rst | 9 ++++-- .../docs/clang-tidy/checks/list.rst | 3 ++ .../exception-copy-constructor-throws.cpp} | 4 +-- 11 files changed, 68 insertions(+), 21 deletions(-) rename clang-tools-extra/clang-tidy/{cert/ThrownExceptionTypeCheck.cpp => bugprone/ExceptionCopyConstructorThrowsCheck.cpp} (75%) rename clang-tools-extra/clang-tidy/{cert/ThrownExceptionTypeCheck.h => bugprone/ExceptionCopyConstructorThrowsCheck.h} (58%) create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst rename clang-tools-extra/test/clang-tidy/checkers/{cert/throw-exception-type.cpp => bugprone/exception-copy-constructor-throws.cpp} (93%) diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index baea231f6e060..3ba1532334e4a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -30,6 +30,7 @@ #include "DynamicStaticInitializersCheck.h" #include "EasilySwappableParametersCheck.h" #include "EmptyCatchCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "ExceptionEscapeCheck.h" #include "FloatLoopCounterCheck.h" #include "FoldInitTypeCheck.h" @@ -155,6 +156,8 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck( "bugprone-easily-swappable-parameters"); CheckFactories.registerCheck("bugprone-empty-catch"); + CheckFactories.registerCheck( + "bugprone-exception-copy-constructor-throws"); CheckFactories.registerCheck( "bugprone-exception-escape"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index aacaa61888147..49c467aa5090c 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -26,6 +26,7 @@ add_clang_library(clangTidyBugproneModule STATIC DynamicStaticInitializersCheck.cpp EasilySwappableParametersCheck.cpp EmptyCatchCheck.cpp + ExceptionCopyConstructorThrowsCheck.cpp ExceptionEscapeCheck.cpp FloatLoopCounterCheck.cpp FoldInitTypeCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp similarity index 75% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp index 2225a90aeece1..73658459b8e26 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.cpp @@ -6,15 +6,16 @@ // //===----------------------------------------------------------------------===// -#include "ThrownExceptionTypeCheck.h" +#include "ExceptionCopyConstructorThrowsCheck.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" using namespace clang::ast_matchers; -namespace clang::tidy::cert { 
+namespace clang::tidy::bugprone { -void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { +void ExceptionCopyConstructorThrowsCheck::registerMatchers( + MatchFinder *Finder) { Finder->addMatcher( traverse( TK_AsIs, @@ -25,10 +26,11 @@ void ThrownExceptionTypeCheck::registerMatchers(MatchFinder *Finder) { this); } -void ThrownExceptionTypeCheck::check(const MatchFinder::MatchResult &Result) { +void ExceptionCopyConstructorThrowsCheck::check( + const MatchFinder::MatchResult &Result) { const auto *E = Result.Nodes.getNodeAs("expr"); diag(E->getExprLoc(), "thrown exception type is not nothrow copy constructible"); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h similarity index 58% rename from clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h rename to clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h index 41a5145209686..f1d7cca0e5bad 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionCopyConstructorThrowsCheck.h @@ -6,20 +6,20 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Checks whether a thrown object is nothrow copy constructible. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/err60-cpp.html -class ThrownExceptionTypeCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/exception-copy-constructor-throws.html +class ExceptionCopyConstructorThrowsCheck : public ClangTidyCheck { public: - ThrownExceptionTypeCheck(StringRef Name, ClangTidyContext *Context) + ExceptionCopyConstructorThrowsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -28,6 +28,6 @@ class ThrownExceptionTypeCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_THROWNEXCEPTIONTYPECHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_EXCEPTIONCOPYCONSTRUCTORTHROWSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index 2f6fc4db46545..6dbcecee1e023 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -13,6 +13,7 @@ #include "../bugprone/CommandProcessorCheck.h" #include "../bugprone/CopyConstructorMutatesArgumentCheck.h" #include "../bugprone/DefaultOperatorNewOnOveralignedTypeCheck.h" +#include "../bugprone/ExceptionCopyConstructorThrowsCheck.h" #include "../bugprone/FloatLoopCounterCheck.h" #include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h" #include "../bugprone/RawMemoryCallOnNonTrivialTypeCheck.h" @@ -41,7 +42,6 @@ #include "../readability/UppercaseLiteralSuffixCheck.h" #include "LimitedRandomnessCheck.h" #include "ProperlySeededRandomGeneratorCheck.h" -#include "ThrownExceptionTypeCheck.h" namespace { @@ -262,7 +262,8 @@ class CERTModule : public ClangTidyModule { "cert-err52-cpp"); CheckFactories.registerCheck( "cert-err58-cpp"); - CheckFactories.registerCheck("cert-err60-cpp"); + CheckFactories.registerCheck( + "cert-err60-cpp"); CheckFactories.registerCheck( "cert-err61-cpp"); // MEM diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index 5abb47277e78f..81015a02023ba 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -7,7 +7,6 @@ add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp LimitedRandomnessCheck.cpp ProperlySeededRandomGeneratorCheck.cpp - ThrownExceptionTypeCheck.cpp LINK_LIBS clangTidy diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 48a2a1f5d39d5..c233301a08f36 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -269,6 +269,10 @@ New check aliases ` keeping initial check as an alias to the new one. 
+- Renamed :doc:`cert-err60-cpp ` to + :doc:`bugprone-exception-copy-constructor-throws + ` + - Renamed :doc:`cert-flp30-c ` to :doc:`bugprone-float-loop-counter ` diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst new file mode 100644 index 0000000000000..8c3becf80a541 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-copy-constructor-throws.rst @@ -0,0 +1,31 @@ +.. title:: clang-tidy - bugprone-exception-copy-constructor-throws + +bugprone-exception-copy-constructor-throws +========================================== + +Checks whether a thrown object's copy constructor can throw. + +Exception objects are required to be copy constructible in C++. However, an +exception's copy constructor should not throw to avoid potential issues when +unwinding the stack. If an exception is thrown during stack unwinding (such +as from a copy constructor of an exception object), the program will +terminate via ``std::terminate``. + +.. code-block:: c++ + + class SomeException { + public: + SomeException() = default; + SomeException(const SomeException&) { /* may throw */ } + }; + + void f() { + throw SomeException(); // warning: thrown exception type's copy constructor can throw + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`ERR60-CPP. Exception objects must be nothrow copy constructible +`_. \ No newline at end of file diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst index 9fcb840fc06f8..8d6dd1bf4b9b7 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/err60-cpp.rst @@ -1,11 +1,14 @@ .. title:: clang-tidy - cert-err60-cpp +.. meta:: + :http-equiv=refresh: 5;URL=../bugprone/exception-copy-constructor-throws.html cert-err60-cpp ============== -This check flags all throw expressions where the exception object is not nothrow -copy constructible. +The `cert-err60-cpp` check is an alias, please see +`bugprone-exception-copy-constructor-throws <../bugprone/exception-copy-constructor-throws.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `ERR60-CPP. Exception objects must be nothrow copy constructible -`_. +`_. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e2875604af72b..3b0ff3ef33365 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -98,6 +98,7 @@ Clang-Tidy Checks :doc:`bugprone-dynamic-static-initializers `, :doc:`bugprone-easily-swappable-parameters `, :doc:`bugprone-empty-catch `, + :doc:`bugprone-exception-copy-constructor-throws `, :doc:`bugprone-exception-escape `, :doc:`bugprone-float-loop-counter `, :doc:`bugprone-fold-init-type `, @@ -180,6 +181,7 @@ Clang-Tidy Checks :doc:`bugprone-virtual-near-miss `, "Yes" :doc:`cert-err33-c `, :doc:`cert-err60-cpp `, + :doc:`cert-flp30-c `, :doc:`cert-msc50-cpp `, :doc:`cert-msc51-cpp `, :doc:`cert-oop58-cpp `, @@ -449,6 +451,7 @@ Check aliases :doc:`cert-err34-c `, :doc:`bugprone-unchecked-string-to-number-conversion `, :doc:`cert-err52-cpp `, :doc:`modernize-avoid-setjmp-longjmp `, :doc:`cert-err58-cpp `, :doc:`bugprone-throwing-static-initialization `, + :doc:`cert-err60-cpp `, :doc:`bugprone-exception-copy-constructor-throws `, :doc:`cert-err61-cpp `, :doc:`misc-throw-by-value-catch-by-reference `, :doc:`cert-exp42-c `, :doc:`bugprone-suspicious-memory-comparison `, :doc:`cert-fio38-c `, :doc:`misc-non-copyable-objects `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp similarity index 93% rename from clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp index 34ca83795c397..7e2d586175c1b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/throw-exception-type.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-copy-constructor-throws.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11,c++14 %s cert-err60-cpp %t -- -- -fcxx-exceptions +// RUN: %check_clang_tidy -std=c++11,c++14 %s bugprone-exception-copy-constructor-throws %t -- -- -fcxx-exceptions // FIXME: Split off parts of this test that rely on dynamic exception // specifications, and run this test in all language modes. // FIXME: Fix the checker to work in C++17 or later mode. 
@@ -92,7 +92,7 @@ void f() { throw U(); // ok throw V(); // ok throw W(); // match, noexcept(false) - // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [cert-err60-cpp] + // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible [bugprone-exception-copy-constructor-throws] throw X(); // match, no noexcept clause, nontrivial // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: thrown exception type is not nothrow copy constructible throw Y(); // ok From abdb9a0ec57e84ad9f09642ac6c910184aab31bd Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 10 Nov 2025 08:26:46 +0000 Subject: [PATCH 07/24] [gn build] Port 4b433cbdeec7 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index b01cfb9f4c915..de812cd7d5561 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -38,6 +38,7 @@ static_library("bugprone") { "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", "EmptyCatchCheck.cpp", + "ExceptionCopyConstructorThrowsCheck.cpp", "ExceptionEscapeCheck.cpp", "FloatLoopCounterCheck.cpp", "FoldInitTypeCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index 18708f68b59c5..65c149b9d9360 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -18,6 +18,5 @@ static_library("cert") { "CERTTidyModule.cpp", "LimitedRandomnessCheck.cpp", "ProperlySeededRandomGeneratorCheck.cpp", - "ThrownExceptionTypeCheck.cpp", ] } From d0081aa92923cf5ee1d2ec4c6191a492c5a8cf25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Mon, 10 Nov 2025 09:38:15 +0100 Subject: [PATCH 08/24] [NFC][SPIRV] Make the zero-length-array.ll test explicit about what is generated (#166910) This patch doesn't change anything. Just adds more explicit checks to verify what is generated in this case when an alloca has a zero-sized array. I'd expect an `OpRuntimeArray`, but nothing is generated. --- llvm/test/CodeGen/SPIRV/zero-length-array.ll | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/SPIRV/zero-length-array.ll b/llvm/test/CodeGen/SPIRV/zero-length-array.ll index 666176c87adb6..5fd94d25dfd87 100644 --- a/llvm/test/CodeGen/SPIRV/zero-length-array.ll +++ b/llvm/test/CodeGen/SPIRV/zero-length-array.ll @@ -1,10 +1,17 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} -; CHECK: %[[#type:]] = OpTypeInt 32 0 -; CHECK: %[[#ext:]] = OpConstant %[[#type]] 0 +; Nothing is generated, but compilation doesn't crash. 
+; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#RTM:]] "reg2mem alloca point" +; CHECK: %[[#INT:]] = OpTypeInt 32 0 +; CHECK: %[[#RTM]] = OpConstant %[[#INT]] 0 +; CHECK: %[[#FOO]] = OpFunction +; CHECK-NEXT: = OpLabel +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd -define spir_func void @_Z3foov() { +define spir_func void @foo() { entry: %i = alloca [0 x i32], align 4 ret void From eaa889ab1791618eddc6a22d777750ac936b65a1 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Mon, 10 Nov 2025 11:41:21 +0300 Subject: [PATCH 09/24] [clang-tidy] Add fine-graded configuration for 'bugprone-exception-escape' (#164081) Need these options to complete https://github.com/llvm/llvm-project/issues/160825, but I think it's generally beneficial to fine-tune this check. --------- Co-authored-by: EugeneZelenko Co-authored-by: Victor Chernyakin --- .../bugprone/ExceptionEscapeCheck.cpp | 42 +++++++++++++---- .../bugprone/ExceptionEscapeCheck.h | 7 +++ clang-tools-extra/docs/ReleaseNotes.rst | 4 +- .../checks/bugprone/exception-escape.rst | 25 ++++++++++ .../bugprone/exception-escape-options.cpp | 47 +++++++++++++++++++ 5 files changed, 114 insertions(+), 11 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp index 837a86ff8655e..b7de8395ffa05 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp @@ -36,13 +36,22 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawFunctionsThatShouldNotThrow(Options.get( "FunctionsThatShouldNotThrow", "")), - RawIgnoredExceptions(Options.get("IgnoredExceptions", "")) { + RawIgnoredExceptions(Options.get("IgnoredExceptions", "")), + RawCheckedSwapFunctions( + Options.get("CheckedSwapFunctions", "swap,iter_swap,iter_move")), + CheckDestructors(Options.get("CheckDestructors", true)), + CheckMoveMemberFunctions(Options.get("CheckMoveMemberFunctions", true)), + CheckMain(Options.get("CheckMain", true)), + CheckNothrowFunctions(Options.get("CheckNothrowFunctions", true)) { llvm::SmallVector FunctionsThatShouldNotThrowVec, - IgnoredExceptionsVec; + IgnoredExceptionsVec, CheckedSwapFunctionsVec; RawFunctionsThatShouldNotThrow.split(FunctionsThatShouldNotThrowVec, ",", -1, false); FunctionsThatShouldNotThrow.insert_range(FunctionsThatShouldNotThrowVec); + RawCheckedSwapFunctions.split(CheckedSwapFunctionsVec, ",", -1, false); + CheckedSwapFunctions.insert_range(CheckedSwapFunctionsVec); + llvm::StringSet<> IgnoredExceptions; RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false); IgnoredExceptions.insert_range(IgnoredExceptionsVec); @@ -54,20 +63,33 @@ void ExceptionEscapeCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "FunctionsThatShouldNotThrow", RawFunctionsThatShouldNotThrow); Options.store(Opts, "IgnoredExceptions", RawIgnoredExceptions); + Options.store(Opts, "CheckedSwapFunctions", RawCheckedSwapFunctions); + Options.store(Opts, "CheckDestructors", CheckDestructors); + Options.store(Opts, "CheckMoveMemberFunctions", CheckMoveMemberFunctions); + Options.store(Opts, "CheckMain", CheckMain); + Options.store(Opts, "CheckNothrowFunctions", CheckNothrowFunctions); } void ExceptionEscapeCheck::registerMatchers(MatchFinder *Finder) { + auto MatchIf = [](bool Enabled, const 
auto &Matcher) { + ast_matchers::internal::Matcher Nothing = unless(anything()); + return Enabled ? Matcher : Nothing; + }; Finder->addMatcher( functionDecl( isDefinition(), - anyOf(isNoThrow(), - allOf(anyOf(cxxDestructorDecl(), - cxxConstructorDecl(isMoveConstructor()), - cxxMethodDecl(isMoveAssignmentOperator()), isMain(), - allOf(hasAnyName("swap", "iter_swap", "iter_move"), - hasAtLeastOneParameter())), - unless(isExplicitThrow())), - isEnabled(FunctionsThatShouldNotThrow))) + anyOf( + MatchIf(CheckNothrowFunctions, isNoThrow()), + allOf(anyOf(MatchIf(CheckDestructors, cxxDestructorDecl()), + MatchIf( + CheckMoveMemberFunctions, + anyOf(cxxConstructorDecl(isMoveConstructor()), + cxxMethodDecl(isMoveAssignmentOperator()))), + MatchIf(CheckMain, isMain()), + allOf(isEnabled(CheckedSwapFunctions), + hasAtLeastOneParameter())), + unless(isExplicitThrow())), + isEnabled(FunctionsThatShouldNotThrow))) .bind("thrower"), this); } diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h index 31d9e85082c52..c3bf4a4335273 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h @@ -35,8 +35,15 @@ class ExceptionEscapeCheck : public ClangTidyCheck { private: StringRef RawFunctionsThatShouldNotThrow; StringRef RawIgnoredExceptions; + StringRef RawCheckedSwapFunctions; + + const bool CheckDestructors; + const bool CheckMoveMemberFunctions; + const bool CheckMain; + const bool CheckNothrowFunctions; llvm::StringSet<> FunctionsThatShouldNotThrow; + llvm::StringSet<> CheckedSwapFunctions; utils::ExceptionAnalyzer Tracer; }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c233301a08f36..f3d5b6f43a227 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -306,7 +306,9 @@ Changes in existing checks exceptions from captures are now diagnosed, exceptions in the bodies of lambdas that aren't actually invoked are not. Additionally, fixed an issue where the check wouldn't diagnose throws in arguments to functions or - constructors. + constructors. Added fine-grained configuration via options + `CheckDestructors`, `CheckMoveMemberFunctions`, `CheckMain`, + `CheckedSwapFunctions`, and `CheckNothrowFunctions`. - Improved :doc:`bugprone-infinite-loop ` check by adding detection for diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst index 182fade7f47a0..7eaa333d5403a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/exception-escape.rst @@ -35,6 +35,31 @@ WARNING! This check may be expensive on large source files. Options ------- +.. option:: CheckDestructors + + When `true`, destructors are analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckMoveMemberFunctions + + When `true`, move constructors and move assignment operators are analyzed + to not throw exceptions. Default value is `true`. + +.. option:: CheckMain + + When `true`, the ``main()`` function is analyzed to not throw exceptions. + Default value is `true`. + +.. option:: CheckNothrowFunctions + + When `true`, functions marked with ``noexcept`` or ``throw()`` exception + specifications are analyzed to not throw exceptions. Default value is `true`. + +.. 
option:: CheckedSwapFunctions + + Comma-separated list of swap function names which should not throw exceptions. + Default value is `swap,iter_swap,iter_move`. + .. option:: FunctionsThatShouldNotThrow Comma separated list containing function names which should not throw. An diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp new file mode 100644 index 0000000000000..48c9bacd1b2e5 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-options.cpp @@ -0,0 +1,47 @@ +// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-exception-escape %t -- \ +// RUN: -config="{CheckOptions: { \ +// RUN: bugprone-exception-escape.CheckDestructors: false, \ +// RUN: bugprone-exception-escape.CheckMoveMemberFunctions: false, \ +// RUN: bugprone-exception-escape.CheckMain: false, \ +// RUN: bugprone-exception-escape.CheckedSwapFunctions: '', \ +// RUN: bugprone-exception-escape.CheckNothrowFunctions: false \ +// RUN: }}" \ +// RUN: -- -fexceptions + +// CHECK-MESSAGES-NOT: warning: + +struct destructor { + ~destructor() { + throw 1; + } +}; + +struct move { + move(move&&) { throw 42; } + move& operator=(move&&) { throw 42; } +}; + +void swap(int&, int&) { + throw 1; +} + +void iter_swap(int&, int&) { + throw 1; +} + +void iter_move(int&) { + throw 1; +} + +void nothrow_func() throw() { + throw 1; +} + +void noexcept_func() noexcept { + throw 1; +} + +int main() { + throw 1; + return 0; +} From 3637f66b4d819c1cef6a3b58466fc8b1f983cfe0 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 10 Nov 2025 17:01:43 +0800 Subject: [PATCH 10/24] [RISCV][TTI] Fix crash of non-built-in vector type cost quering. (#167258) For the non-built-in vector type, the RISCV cost model cannot handle this properly. So fall back to the BasicTTI for this situation. Fixes: #166732 --- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 3 ++- llvm/test/Analysis/CostModel/RISCV/cast.ll | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 332433b4e530b..3d8eb4097604a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1683,7 +1683,8 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src), SrcLT.second.getSizeInBits()) || !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst), - DstLT.second.getSizeInBits())) + DstLT.second.getSizeInBits()) || + SrcLT.first > 1 || DstLT.first > 1) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); // The split cost is handled by the base getCastInstrCost diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index e64bce2d9c9e5..6dacd59f07fde 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -6239,3 +6239,13 @@ define void @legalization_crash() { fptoui <192 x float> undef to <192 x i1> ret void } + +; Test that types that need to be split go through BasicTTIImpl. 
+define void @BitInt_crash() { +; ZVE64X-LABEL: 'BitInt_crash' +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 2043 for instruction: %1 = bitcast <16 x i64> poison to <512 x i2> +; ZVE64X-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + bitcast <16 x i64> poison to <512 x i2> + ret void +} From 152bda726958c45be709270bc8a5e1fda642f375 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 10 Nov 2025 10:09:14 +0100 Subject: [PATCH 11/24] [libc++] Replace the last uses of __tuple_types with __type_list (#167214) `__tuple_types` is at this point just a `__type_list` with a weird name, so we can just replace the few places it's still used. --- libcxx/include/CMakeLists.txt | 1 - libcxx/include/__tuple/tuple_size.h | 1 - libcxx/include/__tuple/tuple_types.h | 25 ------------------------- libcxx/include/module.modulemap.in | 1 - libcxx/include/tuple | 14 +++++++------- 5 files changed, 7 insertions(+), 35 deletions(-) delete mode 100644 libcxx/include/__tuple/tuple_types.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 46e17b584432e..09d4552664dd7 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -786,7 +786,6 @@ set(files __tuple/tuple_like.h __tuple/tuple_like_no_subrange.h __tuple/tuple_size.h - __tuple/tuple_types.h __type_traits/add_cv_quals.h __type_traits/add_pointer.h __type_traits/add_reference.h diff --git a/libcxx/include/__tuple/tuple_size.h b/libcxx/include/__tuple/tuple_size.h index 60f2a667a1ba3..719edc0e342c0 100644 --- a/libcxx/include/__tuple/tuple_size.h +++ b/libcxx/include/__tuple/tuple_size.h @@ -12,7 +12,6 @@ #include <__config> #include <__cstddef/size_t.h> #include <__fwd/tuple.h> -#include <__tuple/tuple_types.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_const.h> diff --git a/libcxx/include/__tuple/tuple_types.h b/libcxx/include/__tuple/tuple_types.h deleted file mode 100644 index 7e1256cf8790e..0000000000000 --- a/libcxx/include/__tuple/tuple_types.h +++ /dev/null @@ -1,25 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TUPLE_TUPLE_TYPES_H -#define _LIBCPP___TUPLE_TUPLE_TYPES_H - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -template -struct __tuple_types {}; - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TUPLE_TUPLE_TYPES_H diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index f77c885da5b6a..2266a1d1d4c1c 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -2124,7 +2124,6 @@ module std [system] { module tuple_like_no_subrange { header "__tuple/tuple_like_no_subrange.h" } module tuple_like { header "__tuple/tuple_like.h" } module tuple_size { header "__tuple/tuple_size.h" } - module tuple_types { header "__tuple/tuple_types.h" } header "tuple" export * diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 3c5330dd6e14e..a960b64a71763 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -235,7 +235,6 @@ template # include <__tuple/tuple_element.h> # include <__tuple/tuple_like.h> # include <__tuple/tuple_size.h> -# include <__tuple/tuple_types.h> # include <__type_traits/common_reference.h> # include <__type_traits/common_type.h> # include <__type_traits/conditional.h> @@ -265,6 +264,7 @@ template # include <__type_traits/remove_cv.h> # include <__type_traits/remove_cvref.h> # include <__type_traits/remove_reference.h> +# include <__type_traits/type_list.h> # include <__type_traits/unwrap_ref.h> # include <__utility/declval.h> # include <__utility/forward.h> @@ -571,7 +571,7 @@ __memberwise_copy_assign(_Dest& __dest, _Source const& __source, __index_sequenc template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void -__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __tuple_types<_Up...>, __index_sequence<_Np...>) { +__memberwise_forward_assign(_Dest& __dest, _Source&& __source, __type_list<_Up...>, __index_sequence<_Np...>) { std::__swallow(((std::get<_Np>(__dest) = std::forward<_Up>(std::get<_Np>(__source))), void(), 0)...); } @@ -876,7 +876,7 @@ public: requires(_And...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -885,7 +885,7 @@ public: operator=(_If<_And...>::value, tuple, __nat>&& __tuple) noexcept( _And...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Tp...>(), __make_index_sequence()); + *this, std::move(__tuple), __type_list<_Tp...>(), __make_index_sequence()); return *this; } @@ -905,7 +905,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(tuple<_Up...>&& __tuple) noexcept(_And...>::value) { std::__memberwise_forward_assign( - *this, std::move(__tuple), __tuple_types<_Up...>(), __make_index_sequence()); + *this, std::move(__tuple), __type_list<_Up...>(), __make_index_sequence()); return *this; } @@ -922,7 +922,7 @@ public: enable_if_t< _And<_BoolConstant, is_assignable...>::value>* = nullptr> _LIBCPP_HIDE_FROM_ABI constexpr const tuple& operator=(tuple<_UTypes...>&& __u) const { - std::__memberwise_forward_assign(*this, __u, __tuple_types<_UTypes...>(), __make_index_sequence()); + std::__memberwise_forward_assign(*this, __u, 
__type_list<_UTypes...>(), __make_index_sequence()); return *this; } # endif // _LIBCPP_STD_VER >= 23 @@ -1000,7 +1000,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 tuple& operator=(array<_Up, _Np>&& __array) noexcept(_And...>::value) { std::__memberwise_forward_assign( - *this, std::move(__array), __tuple_types<_If...>(), __make_index_sequence()); + *this, std::move(__array), __type_list<_If...>(), __make_index_sequence()); return *this; } From 6ef174c44c9bd2847aef06f1dcdabdd1fb5834c5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 10 Nov 2025 09:09:37 +0000 Subject: [PATCH 12/24] [gn build] Port 152bda726958 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 27bd2ce9849f6..66531c706ac7b 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1432,7 +1432,6 @@ if (current_toolchain == default_toolchain) { "__tuple/tuple_like.h", "__tuple/tuple_like_no_subrange.h", "__tuple/tuple_size.h", - "__tuple/tuple_types.h", "__type_traits/add_cv_quals.h", "__type_traits/add_pointer.h", "__type_traits/add_reference.h", From 471dbb90cb35e45cb33512b5022b4d02f319b54e Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 10 Nov 2025 11:00:05 +0100 Subject: [PATCH 13/24] [libc++] Replace __libcpp_is_final with a variable template (#167137) --- libcxx/include/__exception/nested_exception.h | 2 +- libcxx/include/__memory/compressed_pair.h | 2 +- libcxx/include/__type_traits/is_final.h | 2 +- libcxx/include/tuple | 2 +- .../util.smartptr.shared/libcxx.control_block_layout.pass.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 90b14158d57a2..dc3266a27cdfd 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -73,7 +73,7 @@ template __throw_with_nested<_Tp, _Up, is_class<_Up>::value && !is_base_of::value && - !__libcpp_is_final<_Up>::value>::__do_throw(std::forward<_Tp>(__t)); + !__is_final_v<_Up> >::__do_throw(std::forward<_Tp>(__t)); #else ((void)__t); // FIXME: Make this abort diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h index 0388d752ccc8b..f1f1c920453cf 100644 --- a/libcxx/include/__memory/compressed_pair.h +++ b/libcxx/include/__memory/compressed_pair.h @@ -67,7 +67,7 @@ inline const size_t __compressed_pair_alignment<_Tp&> = _LIBCPP_ALIGNOF(void*); template inline const bool __is_reference_or_unpadded_object = - (is_empty<_ToPad>::value && !__libcpp_is_final<_ToPad>::value) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; + (is_empty<_ToPad>::value && !__is_final_v<_ToPad>) || sizeof(_ToPad) == __datasizeof_v<_ToPad>; template inline const bool __is_reference_or_unpadded_object<_Tp&> = true; diff --git a/libcxx/include/__type_traits/is_final.h b/libcxx/include/__type_traits/is_final.h index e9ef1425c9760..ab1cace52c4f6 100644 --- a/libcxx/include/__type_traits/is_final.h +++ b/libcxx/include/__type_traits/is_final.h @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template -struct __libcpp_is_final : integral_constant {}; +inline const bool __is_final_v = __is_final(_Tp); #if _LIBCPP_STD_VER >= 14 template diff --git a/libcxx/include/tuple b/libcxx/include/tuple index a960b64a71763..0cfcd9a4fd9c5 100644 --- a/libcxx/include/tuple 
+++ b/libcxx/include/tuple @@ -347,7 +347,7 @@ using __tuple_common_comparison_category _LIBCPP_NODEBUG = // __tuple_leaf -template ::value && !__libcpp_is_final<_Hp>::value > +template ::value && !__is_final_v<_Hp> > class __tuple_leaf; template diff --git a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp index 0b48bc92f02af..9cb5b2ffbae97 100644 --- a/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp +++ b/libcxx/test/libcxx/utilities/memory/util.smartptr/util.smartptr.shared/libcxx.control_block_layout.pass.cpp @@ -30,7 +30,7 @@ struct value_init_tag {}; -template ::value && !std::__libcpp_is_final::value> +template ::value && !std::__is_final_v> struct compressed_pair_elem { explicit compressed_pair_elem(value_init_tag) : value_() {} From 0b52b829552f9f5cc9712c2f5047c3ebedccacb7 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 10 Nov 2025 11:00:59 +0100 Subject: [PATCH 14/24] [libc++] Merge insert/emplace(const_iterator, Args...) implementations (#166470) --- libcxx/include/deque | 116 +++++-------------------------------------- 1 file changed, 12 insertions(+), 104 deletions(-) diff --git a/libcxx/include/deque b/libcxx/include/deque index ab41b9db9de26..cbf4b98e07a5b 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -779,6 +779,10 @@ public: // 23.2.2.3 modifiers: _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); _LIBCPP_HIDE_FROM_ABI void push_back(const value_type& __v); + + template + _LIBCPP_HIDE_FROM_ABI iterator __emplace(const_iterator __p, _Args&&... __args); + # ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 template @@ -791,8 +795,11 @@ public: template _LIBCPP_HIDE_FROM_ABI void emplace_back(_Args&&... __args); # endif + template - _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args); + _LIBCPP_HIDE_FROM_ABI iterator emplace(const_iterator __p, _Args&&... __args) { + return __emplace(__p, std::forward<_Args>(__args)...); + } _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); _LIBCPP_HIDE_FROM_ABI void push_back(value_type&& __v); @@ -809,13 +816,13 @@ public: } # endif - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, value_type&& __v) { return __emplace(__p, std::move(__v)); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, initializer_list __il) { return insert(__p, __il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v); + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, const value_type& __v) { return __emplace(__p, __v); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, size_type __n, const value_type& __v); template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __p, _InputIter __f, _InputIter __l); @@ -1661,56 +1668,11 @@ deque<_Tp, _Allocator>::emplace_front(_Args&&... 
__args) { return *begin(); # endif } - -template -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, value_type&& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), std::move(__v)); - --__start_; - ++__size(); - } else { - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = std::move(std::next(__b), __b + __pos, __b); - *__b = std::move(__v); - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), std::move(__v)); - ++__size(); - } else { - iterator __e = end(); - iterator __em1 = std::prev(__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) - __e = std::move_backward(__e - __de, __em1, __e); - *--__e = std::move(__v); - } - } - return begin() + __pos; -} +# endif // _LIBCPP_CXX03_LANG template template -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_iterator __p, _Args&&... __args) { +typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::__emplace(const_iterator __p, _Args&&... __args) { size_type __pos = __p - begin(); size_type __to_end = size() - __pos; allocator_type& __a = __alloc(); @@ -1757,60 +1719,6 @@ typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::emplace(const_ return begin() + __pos; } -# endif // _LIBCPP_CXX03_LANG - -template -typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, const value_type& __v) { - size_type __pos = __p - begin(); - size_type __to_end = size() - __pos; - allocator_type& __a = __alloc(); - if (__pos < __to_end) { // insert by shifting things backward - if (__front_spare() == 0) - __add_front_capacity(); - // __front_spare() >= 1 - __annotate_increase_front(1); - if (__pos == 0) { - __alloc_traits::construct(__a, std::addressof(*--begin()), __v); - --__start_; - ++__size(); - } else { - const_pointer __vt = pointer_traits::pointer_to(__v); - iterator __b = begin(); - iterator __bm1 = std::prev(__b); - if (__vt == pointer_traits::pointer_to(*__b)) - __vt = pointer_traits::pointer_to(*__bm1); - __alloc_traits::construct(__a, std::addressof(*__bm1), std::move(*__b)); - --__start_; - ++__size(); - if (__pos > 1) - __b = __move_and_check(std::next(__b), __b + __pos, __b, __vt); - *__b = *__vt; - } - } else { // insert by shifting things forward - if (__back_spare() == 0) - __add_back_capacity(); - // __back_capacity >= 1 - __annotate_increase_back(1); - size_type __de = size() - __pos; - if (__de == 0) { - __alloc_traits::construct(__a, std::addressof(*end()), __v); - ++__size(); - } else { - const_pointer __vt = pointer_traits::pointer_to(__v); - iterator __e = end(); - iterator __em1 = std::prev(__e); - if (__vt == pointer_traits::pointer_to(*__em1)) - __vt = pointer_traits::pointer_to(*__e); - __alloc_traits::construct(__a, std::addressof(*__e), std::move(*__em1)); - ++__size(); - if (__de > 1) 
- __e = __move_backward_and_check(__e - __de, __em1, __e, __vt); - *--__e = *__vt; - } - } - return begin() + __pos; -} - template typename deque<_Tp, _Allocator>::iterator deque<_Tp, _Allocator>::insert(const_iterator __p, size_type __n, const value_type& __v) { From 57dad86cb3dbbaf0da5aac6b3d3c5d5e1bae64fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Mon, 10 Nov 2025 11:02:53 +0100 Subject: [PATCH 15/24] [SPIRV] Fix failing assertion in SPIRVAsmPrinter (#166909) With `+SPV_KHR_float_controls2` and when there is a non-int `OpConstantNull` we would call `MI.getOperand(1).getImm()` when `MI` was not an `OpTypeInt` (the associated test has an `OpTypeArray` zeroinitialized). Under this conditions an assertion is triggered. This patch adds the missing condition. --- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 23 +++++++++-------- .../CodeGen/SPIRV/non_int_constant_null.ll | 25 +++++++++++++++++++ 2 files changed, 37 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/non_int_constant_null.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 0175f2fb3698b..970b83de5ee33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -612,13 +612,10 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { // Collect the SPIRVTypes for fp16, fp32, and fp64 and the constant of // type int32 with 0 value to represent the FP Fast Math Mode. std::vector SPIRVFloatTypes; - const MachineInstr *ConstZero = nullptr; + const MachineInstr *ConstZeroInt32 = nullptr; for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { - // Skip if the instruction is not OpTypeFloat or OpConstant. unsigned OpCode = MI->getOpcode(); - if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantNull) - continue; // Collect the SPIRV type if it's a float. if (OpCode == SPIRV::OpTypeFloat) { @@ -629,14 +626,18 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { continue; } SPIRVFloatTypes.push_back(MI); - } else { + continue; + } + + if (OpCode == SPIRV::OpConstantNull) { // Check if the constant is int32, if not skip it. 
const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); - if (!TypeMI || TypeMI->getOperand(1).getImm() != 32) - continue; - - ConstZero = MI; + bool IsInt32Ty = TypeMI && + TypeMI->getOpcode() == SPIRV::OpTypeInt && + TypeMI->getOperand(1).getImm() == 32; + if (IsInt32Ty) + ConstZeroInt32 = MI; } } @@ -657,9 +658,9 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { MCRegister TypeReg = MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(TypeReg)); - assert(ConstZero && "There should be a constant zero."); + assert(ConstZeroInt32 && "There should be a constant zero."); MCRegister ConstReg = MAI->getRegisterAlias( - ConstZero->getMF(), ConstZero->getOperand(0).getReg()); + ConstZeroInt32->getMF(), ConstZeroInt32->getOperand(0).getReg()); Inst.addOperand(MCOperand::createReg(ConstReg)); outputMCInst(Inst); } diff --git a/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll new file mode 100644 index 0000000000000..0ba016aaa30aa --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - -filetype=obj | spirv-val %} + +@A = addrspace(1) constant [1 x i8] zeroinitializer + +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#A:]] "A" +; CHECK: OpDecorate %[[#A]] Constant +; CHECK: OpDecorate %[[#A]] LinkageAttributes "A" Export +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#INT32:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#INT32]] 1 +; CHECK: %[[#ARR_INT8:]] = OpTypeArray %[[#INT8]] %7 +; CHECK: %[[#ARR_INT8_PTR:]] = OpTypePointer CrossWorkgroup %[[#ARR_INT8]] +; CHECK: %[[#ARR_INT8_ZERO:]] = OpConstantNull %[[#ARR_INT8]] +; CHECK: %13 = OpVariable %[[#ARR_INT8_PTR]] CrossWorkgroup %[[#ARR_INT8_ZERO]] +; CHECK: %[[#FOO]] = OpFunction +; CHECK: = OpLabel +; CHECK: OpReturn +; CHECK: OpFunctionEnd + +define spir_kernel void @foo() { +entry: + ret void +} From d84a911e7e752e6337d1c0113818942b5b9350f5 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Mon, 10 Nov 2025 10:07:15 +0000 Subject: [PATCH 16/24] [AArch64][SVE] Avoid redundant extend of unsigned i8/i16 extracts. (#165863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracts of unsigned i8 or i16 elements from the bottom 128 bits of a scalable register lead to the implied zero-extend being transformed to an AND mask. The mask is redundant since UMOV already zeroes the high bits of the destination register. 
For example: ```c int foo(svuint8_t x) {   return x[3]; } ``` Currently: ```gas foo:   umov    w8, v0.b[3]   and     w0, w8, #0xff   ret ``` Becomes: ```gas foo:   umov    w0, v0.b[3]   ret ``` --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 12 ++ .../CodeGen/AArch64/sve-extract-element.ll | 132 ++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index e1f43867bbe5b..65b6077894673 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3597,6 +3597,18 @@ let Predicates = [HasSVE_or_SME] in { def : Pat<(sext (i32 (vector_extract nxv4i32:$vec, VectorIndexS:$index))), (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index)>; + + // Extracts of ``unsigned'' i8 or i16 elements lead to the zero-extend being + // transformed to an AND mask. The mask is redundant since UMOV already zeroes + // the high bits of the destination register. + def : Pat<(i32 (and (vector_extract nxv16i8:$vec, VectorIndexB:$index), 0xff)), + (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)>; + def : Pat<(i32 (and (vector_extract nxv8i16:$vec, VectorIndexH:$index), 0xffff)), + (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv16i8:$vec, VectorIndexB:$index)))), (i64 0xff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index)), sub_32)>; + def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract nxv8i16:$vec, VectorIndexH:$index)))), (i64 0xffff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index)), sub_32)>; } // End HasNEON // Extract first element from vector. diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index c340df1385124..0cc2e04bfb315 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -12,6 +12,26 @@ define i8 @test_lane0_16xi8( %a) #0 { ret i8 %b } +define i32 @test_lane0_16xi8_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_16xi8_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane15_16xi8( %a) #0 { ; CHECK-LABEL: test_lane15_16xi8: ; CHECK: // %bb.0: @@ -21,6 +41,26 @@ define i8 @test_lane15_16xi8( %a) #0 { ret i8 %b } +define i32 @test_lane15_16xi8_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement %a, i32 15 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane15_16xi8_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement %a, i32 15 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane16_16xi8( %a) #0 { ; CHECK-LABEL: test_lane16_16xi8: ; CHECK: // %bb.0: @@ -31,6 +71,32 @@ define i8 @test_lane16_16xi8( %a) #0 { ret i8 %b } +; FIXME: FMOV+AND -> UMOV. 
+define i32 @test_lane16_16xi8_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %b = extractelement %a, i32 16 + %c = zext i8 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. +define i64 @test_lane16_16xi8_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: ret + %b = extractelement %a, i32 16 + %c = zext i8 %b to i64 + ret i64 %c +} + define i16 @test_lane0_8xi16( %a) #0 { ; CHECK-LABEL: test_lane0_8xi16: ; CHECK: // %bb.0: @@ -40,6 +106,26 @@ define i16 @test_lane0_8xi16( %a) #0 { ret i16 %b } +define i32 @test_lane0_8xi16_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_8xi16_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement %a, i32 0 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane7_8xi16( %a) #0 { ; CHECK-LABEL: test_lane7_8xi16: ; CHECK: // %bb.0: @@ -49,6 +135,26 @@ define i16 @test_lane7_8xi16( %a) #0 { ret i16 %b } +define i32 @test_lane7_8xi16_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement %a, i32 7 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane7_8xi16_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement %a, i32 7 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane8_8xi16( %a) #0 { ; CHECK-LABEL: test_lane8_8xi16: ; CHECK: // %bb.0: @@ -59,6 +165,32 @@ define i16 @test_lane8_8xi16( %a) #0 { ret i16 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane8_8xi16_zext_i32( %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret + %b = extractelement %a, i32 8 + %c = zext i16 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. +define i64 @test_lane8_8xi16_zext_i64( %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: ret + %b = extractelement %a, i32 8 + %c = zext i16 %b to i64 + ret i64 %c +} + define i32 @test_lane0_4xi32( %a) #0 { ; CHECK-LABEL: test_lane0_4xi32: ; CHECK: // %bb.0: From ff1efe9e7310fb57a96bb27caddc185779120f43 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Mon, 10 Nov 2025 10:26:05 +0000 Subject: [PATCH 17/24] [AArch64] Combine subtract with borrow to SBC. (#165271) Specifically, this patch adds the following combines: SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b) SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b) The CSET may be preceded by a ZEXT. Fixes #164748. 
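As a rough C-level illustration (assumed, not taken from this PR; the authoritative cases are the IR tests in `sbc.ll` below), a two-limb subtraction produces the second pattern above, where the borrow out of the low limb feeds the high-limb subtract:

```c
// Hypothetical example: 128-bit subtraction split into two 64-bit limbs.
// The borrow out of the low limb is an unsigned compare (CSET LO after
// CMP alo, blo), and subtracting it from the high-limb difference matches
// SUB (SUB ahi, bhi), (CSET LO, (CMP alo, blo)), which this combine
// should now select as a single SBC instead of CSET + SUB.
typedef unsigned long long u64;

void sub128(u64 alo, u64 ahi, u64 blo, u64 bhi, u64 *rlo, u64 *rhi) {
  u64 lo = alo - blo;        // low limb difference (wraps on borrow)
  u64 borrow = alo < blo;    // 1 if the low limb borrowed, else 0
  *rlo = lo;
  *rhi = ahi - bhi - borrow; // high limb difference minus the borrow
}
```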
--- .../Target/AArch64/AArch64ISelLowering.cpp | 33 ++ llvm/test/CodeGen/AArch64/sbc.ll | 392 ++++++++++++++++++ 2 files changed, 425 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sbc.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 76a790dc2dbc9..8457f6178fdc2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22308,6 +22308,37 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift); } +// Attempt to combine the following patterns: +// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b) +// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b) +// The CSET may be preceded by a ZEXT. +static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() != ISD::SUB) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse()) + N1 = N1.getOperand(0); + if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO) + return SDValue(); + + SDValue Flags = N1.getOperand(3); + if (Flags.getOpcode() != AArch64ISD::SUBS) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + if (N0->getOpcode() == ISD::SUB) + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0), + N0.getOperand(1), Flags); + return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT), + Flags); +} + static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Try to change sum of two reductions. @@ -22329,6 +22360,8 @@ static SDValue performAddSubCombine(SDNode *N, return Val; if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG)) return Val; + if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG)) + return Val; if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG)) return Val; diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll new file mode 100644 index 0000000000000..fff63c1709218 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sbc.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s +; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s + +target triple = "aarch64-none-linux-gnu" + +define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_basic_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_basic_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 
%x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_mixed_i32_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i32_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_mixed_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_only_borrow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_only_borrow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w2, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + ret i32 %res +} + +define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_sext_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sext_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; CHECK-GI-NEXT: add w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = sext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = add i32 %sub, %carry + ret i32 %res +} + +; FIXME: This case could be supported with reversed operands to the CMP. 
+define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_ugt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, hi +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_ugt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ugt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, lt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp slt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_sgt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, gt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_sgt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp sgt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_setcc_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_setcc_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i1 %cc) + ret i32 %res +} + +define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_carry_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_carry_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i32 %carry) + ret i32 %res +} + +define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_multiple_sub_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: sbc w19, w2, w3 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_sub_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sub w19, w2, w3 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + tail call void @use(i32 %sub) + ret i32 %res +} + +define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) { +; CHECK-SD-LABEL: test_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: cmp w8, w1, uxtb +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxtb +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i8 %a, %b + %carry = zext i1 %cc to i8 + %sub = sub i8 %x, %y + %res = sub i8 %sub, %carry + ret i8 %res +} + +define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) { +; CHECK-SD-LABEL: test_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w1, uxth +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxth +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i16 %a, %b + %carry = zext i1 %cc to i16 + %sub = sub i16 %x, %y + %res = sub i16 %sub, %carry + ret i16 %res +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { +; CHECK-SD-LABEL: test_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v4.4s, #1 +; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %cc = icmp ult <4 x i32> %a, %b + %carry = zext <4 x i1> %cc to <4 x i32> + %sub = sub <4 x i32> %x, %y + %res = sub <4 x i32> %sub, %carry + ret <4 x i32> %res +} + +declare void @use() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} From 898d6fecf6636521321af17fcd69a19f30fa8cf8 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 10 Nov 2025 11:30:37 +0100 Subject: [PATCH 18/24] Remove unused inclusion (#166942) --- llvm/benchmarks/FormatVariadicBM.cpp | 1 + llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 1 - llvm/include/llvm/IR/DiagnosticInfo.h | 1 - llvm/include/llvm/IR/Dominators.h | 1 - llvm/include/llvm/IR/ProfileSummary.h | 1 - llvm/include/llvm/MC/MCAssembler.h | 1 - llvm/include/llvm/Support/TypeSize.h | 1 - llvm/include/llvm/Support/UnicodeCharRanges.h | 1 - llvm/include/llvm/TableGen/DirectiveEmitter.h | 1 - .../llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h | 1 - llvm/include/llvm/XRay/FDRRecordConsumer.h | 1 - llvm/lib/CodeGen/RegAllocGreedy.h | 1 - llvm/lib/Support/Unix/Unix.h | 1 - llvm/lib/Support/Windows/Signals.inc | 1 - llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp | 1 - llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h | 1 - llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 1 - llvm/tools/llvm-stress/llvm-stress.cpp | 1 - llvm/unittests/ADT/CombinationGeneratorTest.cpp | 1 - llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp | 1 - llvm/unittests/ADT/DeltaAlgorithmTest.cpp | 1 - llvm/unittests/ADT/SequenceTest.cpp | 1 - llvm/unittests/Analysis/DomTreeUpdaterTest.cpp | 1 - llvm/unittests/TextAPI/TextStubHelpers.h | 1 - llvm/utils/KillTheDoctor/KillTheDoctor.cpp | 1 - 25 files changed, 1 insertion(+), 24 deletions(-) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index 3e851f0975e8b..b451d1079d29b 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/FormatVariadic.h" #include #include +#include using namespace llvm; using namespace std; diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index a86dca06f8ecd..12dfb6c607bb9 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -50,7 +50,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h index a426fb079ec04..1c86d181e4375 100644 --- a/llvm/include/llvm/IR/DiagnosticInfo.h +++ b/llvm/include/llvm/IR/DiagnosticInfo.h @@ -26,7 +26,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TypeSize.h" -#include #include #include #include diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index bf128a3936cbd..1209def5ac0bd 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -32,7 +32,6 @@ #include "llvm/Support/CFGUpdate.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/GenericDomTree.h" -#include #include namespace llvm { diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h index 6c087ea02b3c3..34012151f729f 100644 --- a/llvm/include/llvm/IR/ProfileSummary.h +++ b/llvm/include/llvm/IR/ProfileSummary.h @@ -14,7 +14,6 @@ #define LLVM_IR_PROFILESUMMARY_H #include "llvm/Support/Compiler.h" -#include #include #include #include diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 144f2118e715c..152b81e284c1a 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -19,7 +19,6 @@ 
#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" -#include #include #include #include diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 0a7ae15edbb33..421d6613bfafc 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -20,7 +20,6 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 2b5fc83d34690..03515cd61515f 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -12,7 +12,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #define DEBUG_TYPE "unicode" diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index ce3e87e470b9d..2080f75eb8cfc 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -20,7 +20,6 @@ #include "llvm/Frontend/Directive/Spelling.h" #include "llvm/Support/MathExtras.h" #include "llvm/TableGen/Record.h" -#include #include #include diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h index 4385df518a111..050396674e159 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h @@ -19,7 +19,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" -#include namespace llvm::sandboxir { diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h index 13bb711328fdc..4ff65f043fe17 100644 --- a/llvm/include/llvm/XRay/FDRRecordConsumer.h +++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h @@ -11,7 +11,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/XRay/FDRRecords.h" -#include #include #include diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index 7f013d1f1f726..4affa275cbf8b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -33,7 +33,6 @@ #include "llvm/CodeGen/SpillPlacement.h" #include "llvm/CodeGen/Spiller.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include #include #include #include diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h index a1d44c69ab1ab..f24d524982b23 100644 --- a/llvm/lib/Support/Unix/Unix.h +++ b/llvm/lib/Support/Unix/Unix.h @@ -22,7 +22,6 @@ #include "llvm/Support/Chrono.h" #include "llvm/Support/Errno.h" #include "llvm/Support/ErrorHandling.h" -#include #include #include #include diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index da68994970ebb..bacbb76e09e6c 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -16,7 +16,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/WindowsError.h" -#include #include #include #include diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index 7885d93cbad98..a2cf0a57675c7 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ 
b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -48,7 +48,6 @@ #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 2573066cd5d63..4146c0ec6ab07 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "WebAssemblyTargetMachine.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include namespace llvm { diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index ab6e6d0687b71..b3bf37a9a462c 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -50,7 +50,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 133812e419d2b..2fe5d6b7e5254 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -40,7 +40,6 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" -#include #include #include #include diff --git a/llvm/unittests/ADT/CombinationGeneratorTest.cpp b/llvm/unittests/ADT/CombinationGeneratorTest.cpp index f3e174a83ba69..219e18bc5e12c 100644 --- a/llvm/unittests/ADT/CombinationGeneratorTest.cpp +++ b/llvm/unittests/ADT/CombinationGeneratorTest.cpp @@ -12,7 +12,6 @@ #include "llvm/Support/ErrorHandling.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -#include #include #include #include diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index 918a2e63da935..9f5149099e309 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -9,7 +9,6 @@ #include "llvm/ADT/DAGDeltaAlgorithm.h" #include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" -#include #include using namespace llvm; diff --git a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp index 24e18f42eb33c..530bd1cb51173 100644 --- a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp @@ -9,7 +9,6 @@ #include "llvm/ADT/DeltaAlgorithm.h" #include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" -#include #include using namespace llvm; diff --git a/llvm/unittests/ADT/SequenceTest.cpp b/llvm/unittests/ADT/SequenceTest.cpp index 7b7dc85cb79be..7aa39568888b2 100644 --- a/llvm/unittests/ADT/SequenceTest.cpp +++ b/llvm/unittests/ADT/SequenceTest.cpp @@ -11,7 +11,6 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include #include using namespace llvm; diff --git a/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp b/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp index cabfc2aba57cf..9f5fe5742a44d 100644 --- a/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp +++ b/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp @@ -17,7 +17,6 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "gtest/gtest.h" -#include using namespace llvm; diff --git a/llvm/unittests/TextAPI/TextStubHelpers.h b/llvm/unittests/TextAPI/TextStubHelpers.h index 7c9c74a252760..87ca7e1c0b4d4 100644 --- 
a/llvm/unittests/TextAPI/TextStubHelpers.h +++ b/llvm/unittests/TextAPI/TextStubHelpers.h @@ -8,7 +8,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/TextAPI/InterfaceFile.h" -#include #include #ifndef TEXT_STUB_HELPERS_H diff --git a/llvm/utils/KillTheDoctor/KillTheDoctor.cpp b/llvm/utils/KillTheDoctor/KillTheDoctor.cpp index 4f642f8886df5..0495560b6c1dc 100644 --- a/llvm/utils/KillTheDoctor/KillTheDoctor.cpp +++ b/llvm/utils/KillTheDoctor/KillTheDoctor.cpp @@ -43,7 +43,6 @@ #include "llvm/Support/WindowsError.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/type_traits.h" -#include #include #include #include From 2d1d5fe78ed01810c89f3705acfe93a7e219c08f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 10 Nov 2025 10:43:37 +0000 Subject: [PATCH 19/24] [VPlan] Simplify branch-cond with getVectorTripCount (#155604) Call getVectorTripCount first, and call getTripCount failing that, in simplifyBranchConditionForVFAndUF, to simplify missed cases. While at it, strip the dead check for a zero TC. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 18 +++++----- .../AArch64/partial-reduce-dot-product.ll | 3 +- .../LoopVectorize/RISCV/low-trip-count.ll | 3 +- .../X86/limit-vf-by-tripcount.ll | 33 ++++++++----------- .../LoopVectorize/X86/load-deref-pred.ll | 3 +- .../vector-loop-backedge-elimination.ll | 32 +++++++----------- 6 files changed, 37 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 634df51a12965..b319fbc7a78c0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1745,17 +1745,17 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, if (match(Term, m_BranchOnCount()) || match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( m_VPValue(), m_VPValue(), m_VPValue()))))) { - // Try to simplify the branch condition if TC <= VF * UF when the latch - // terminator is BranchOnCount or BranchOnCond where the input is - // Not(ActiveLaneMask). - const SCEV *TripCount = - vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); - assert(!isa(TripCount) && + // Try to simplify the branch condition if VectorTC <= VF * UF when the + // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)). 
+ const SCEV *VectorTripCount = + vputils::getSCEVExprForVPValue(&Plan.getVectorTripCount(), SE); + if (isa<SCEVCouldNotCompute>(VectorTripCount)) + VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(VectorTripCount) && "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); - const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); - if (TripCount->isZero() || - !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) + const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements); + if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C)) return false; } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) { // For BranchOnCond, check if we can prove the condition to be true using VF diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index d77ca9875bf01..37eac89acfd11 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1589,8 +1589,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 8ef53cade01ac..345f6f632158a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -295,8 +295,7 @@ define i8 @mul_non_pow_2_low_trip_count(ptr noalias %a) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[TMP1]] = mul <8 x i8> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> [[TMP1]]) ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index c1272e56836f8..6e3b2a5390948 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -12,27 +12,22 @@ define void @test_tc_17_no_epilogue_vectorization(ptr noalias %src, ptr noalias ; CHECK: vector.ph: ;
CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 64 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1:%.*]], align 64 +; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3:%.*]], align 64 +; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 16, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I]] +; CHECK-NEXT: [[LDADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I]] ; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[LDADDR]], align 64 -; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I]] +; CHECK-NEXT: [[STADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[I]] ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 17 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -69,11 +64,11 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -140,7 +135,7 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF4]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label 
[[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -219,7 +214,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -231,7 +226,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD5]], ptr [[TMP15]], align 64 ; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 20 -; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -245,7 +240,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: store i8 [[VAL]], ptr [[STADDR]], align 64 ; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 ; CHECK-NEXT: [[IS_NEXT:%.*]] = icmp ult i64 [[I_NEXT]], 20 -; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[IS_NEXT]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -281,7 +276,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP3]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -294,7 +289,7 @@ define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ; CHECK-NEXT: store i8 [[L]], ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 8771dc9a20379..6605338771c47 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -2581,8 +2581,7 @@ define i32 @test_non_unit_stride_five(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] ; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; 
CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index e160a15ece47d..bba459f776050 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -1140,18 +1140,14 @@ define void @test_vector_tc_eq_16(ptr %A) { ; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 -; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1 +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 8 +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[A]], align 1 ; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 ; VF8UF2-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) ; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) -; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[A]], align 1 ; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP1]], align 1 -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br label %[[SCALAR_PH:.*]] ; VF8UF2: [[SCALAR_PH]]: @@ -1165,7 +1161,7 @@ define void @test_vector_tc_eq_16(ptr %A) { ; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ -1177,14 +1173,10 @@ define void @test_vector_tc_eq_16(ptr %A) { ; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[A]], align 1 ; VF16UF1-NEXT: [[TMP1:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10) -; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1 -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; VF16UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[A]], align 1 +; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br label %[[SCALAR_PH:.*]] ; VF16UF1: [[SCALAR_PH]]: @@ -1198,7 +1190,7 @@ define void @test_vector_tc_eq_16(ptr %A) { ; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -1232,12 +1224,10 @@ exit: ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} ; VF8UF2: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} -; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ;. ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} ; VF16UF1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} -; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ;. From 80fa6e1bcea054109a154916e37ccfe7b9b0a9fb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 10 Nov 2025 10:50:15 +0000 Subject: [PATCH 20/24] [DropAssumes] Drop dereferenceable assumptions after vectorization. (#166947) This patch adds another run of DropUnnecessaryAssumes after vectorization, to clean up assumes that are no longer needed after this point. The main example of such assumes is currently dereferenceable assumptions. This complements https://github.com/llvm/llvm-project/pull/166945, which avoids sinking code if it would mean removing a dereferenceable assumption. There are a few additional cases where some unneeded assumes are left over after vectorization that also get cleaned up. The main motivation is to work together with https://github.com/llvm/llvm-project/pull/166945, but there may be a better solution.
Adding another instance of this pass to the pipeline is not great, but compile-time impact seems in the noise: https://llvm-compile-time-tracker.com/compare.php?from=55e71fe08b6406ec7ce2c81ce042e48717acf204&to=85da4ee3a74126f557cdc74c7b40e048dacb3fc4&stat=instructions:u PR: https://github.com/llvm/llvm-project/pull/166947 --- llvm/include/llvm/Passes/PassBuilder.h | 2 +- .../Scalar/DropUnnecessaryAssumes.h | 6 ++ llvm/lib/Passes/PassBuilder.cpp | 5 ++ llvm/lib/Passes/PassBuilderPipelines.cpp | 14 +++- llvm/lib/Passes/PassRegistry.def | 5 +- .../Scalar/DropUnnecessaryAssumes.cpp | 7 +- llvm/test/Other/new-pm-defaults.ll | 1 + llvm/test/Other/new-pm-lto-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-defaults.ll | 1 + .../new-pm-thinlto-postlink-pgo-defaults.ll | 1 + ...-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + .../DropUnnecessaryAssumes/dereferenceable.ll | 54 +++++++++++++++ .../AArch64/matrix-extract-insert.ll | 68 ------------------- .../PhaseOrdering/AArch64/std-find.ll | 8 +-- 14 files changed, 95 insertions(+), 79 deletions(-) create mode 100644 llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 8538a8b2afe14..8fa21f2cb2dd6 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -742,7 +742,7 @@ class PassBuilder { void addRequiredLTOPreLinkPasses(ModulePassManager &MPM); void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, - bool IsFullLTO); + ThinOrFullLTOPhase LTOPhase); static std::optional<std::vector<PipelineElement>> parsePipelineText(StringRef Text); diff --git a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h index 4ff442ff80c76..54ddcc09f7204 100644 --- a/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h +++ b/llvm/include/llvm/Transforms/Scalar/DropUnnecessaryAssumes.h @@ -19,7 +19,13 @@ namespace llvm { struct DropUnnecessaryAssumesPass : public PassInfoMixin<DropUnnecessaryAssumesPass> { + DropUnnecessaryAssumesPass(bool DropDereferenceable = false) + : DropDereferenceable(DropDereferenceable) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + bool DropDereferenceable; }; } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 40ceb6f6ae28f..e0babc4385aab 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -900,6 +900,11 @@ Expected<bool> parseEntryExitInstrumenterPassOptions(StringRef Params) { "EntryExitInstrumenter"); } +Expected<bool> parseDropUnnecessaryAssumesPassOptions(StringRef Params) { + return PassBuilder::parseSinglePassOption(Params, "drop-deref", + "DropUnnecessaryAssumes"); +} + Expected<bool> parseLoopExtractorPassOptions(StringRef Params) { return PassBuilder::parseSinglePassOption(Params, "single", "LoopExtractor"); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 3f41618b18fcf..2fe963b3b68d9 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1298,10 +1298,18 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, /// TODO: Should LTO cause any differences to this set of passes?
void PassBuilder::addVectorPasses(OptimizationLevel Level, - FunctionPassManager &FPM, bool IsFullLTO) { + FunctionPassManager &FPM, + ThinOrFullLTOPhase LTOPhase) { + const bool IsFullLTO = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink; + FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + // Drop dereferenceable assumes after vectorization, as they are no longer + // needed and can inhibit further optimization. + if (!isLTOPreLink(LTOPhase)) + FPM.addPass(DropUnnecessaryAssumesPass(/*DropDereferenceable=*/true)); + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have significantly shortened a loop body; unroll @@ -1572,7 +1580,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // from the TargetLibraryInfo. OptimizePM.addPass(InjectTLIMappings()); - addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); + addVectorPasses(Level, OptimizePM, LTOPhase); invokeVectorizerEndEPCallbacks(OptimizePM, Level); @@ -2162,7 +2170,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MainFPM.addPass(LoopDistributePass()); - addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); + addVectorPasses(Level, MainFPM, ThinOrFullLTOPhase::FullLTOPostLink); invokeVectorizerEndEPCallbacks(MainFPM, Level); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index d870f99aad552..d8305fe5c8e73 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -432,7 +432,6 @@ FUNCTION_PASS("dot-post-dom", PostDomPrinter()) FUNCTION_PASS("dot-post-dom-only", PostDomOnlyPrinter()) FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(*TM)) -FUNCTION_PASS("drop-unnecessary-assumes", DropUnnecessaryAssumesPass()) FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(*TM)) FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(*TM)) FUNCTION_PASS("expand-reductions", ExpandReductionsPass()) @@ -584,6 +583,10 @@ FUNCTION_PASS_WITH_PARAMS( "early-cse", "EarlyCSEPass", [](bool UseMemorySSA) { return EarlyCSEPass(UseMemorySSA); }, parseEarlyCSEPassOptions, "memssa") +FUNCTION_PASS_WITH_PARAMS( + "drop-unnecessary-assumes", "DropUnnecessaryAssumesPass", + [](bool DropDereferenceable) { return DropUnnecessaryAssumesPass(DropDereferenceable); }, + parseDropUnnecessaryAssumesPassOptions, "drop-deref") FUNCTION_PASS_WITH_PARAMS( "ee-instrument", "EntryExitInstrumenterPass", [](bool PostInlining) { return EntryExitInstrumenterPass(PostInlining); }, diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index a577f517d1e89..4a7144fe6c77a 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -78,11 +78,16 @@ DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { SmallVector KeptBundles; unsigned NumBundles = Assume->getNumOperandBundles(); for (unsigned I = 0; I != NumBundles; ++I) { - auto IsDead = [](OperandBundleUse Bundle) { + auto IsDead = [&](OperandBundleUse Bundle) { // "ignore" operand bundles are always dead. if (Bundle.getTagName() == "ignore") return true; + // "dereferenceable" operand bundles are only dropped if requested + // (e.g., after loop vectorization has run). + if (Bundle.getTagName() == "dereferenceable") + return DropDereferenceable; + // Bundles without arguments do not affect any specific values. // Always keep them for now. 
if (Bundle.Inputs.empty()) diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 62975a3cf8ac4..b59d4cf6af998 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -261,6 +261,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-DEFAULT-NEXT: Running pass: DropUnnecessaryAssumesPass ; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index f595dfe1d6845..c865d77c86d77 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -129,6 +129,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo +; CHECK-O23SZ-NEXT: Running pass: DropUnnecessaryAssumesPass on foo ; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 012a1ab5802b5..c1d8b42505c84 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -180,6 +180,7 @@ ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-POSTLINK-O-NEXT: Running pass: DropUnnecessaryAssumesPass ; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index e021ff3124b60..45f090252eaf7 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass ; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 20f94bc2e0f6c..4c330f44d30cc 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -174,6 +174,7 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: DropUnnecessaryAssumesPass ; CHECK-O-NEXT: Running pass: InferAlignmentPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll new file mode 100644 
index 0000000000000..43fa08c070828 --- /dev/null +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/dereferenceable.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='drop-unnecessary-assumes' -S %s | FileCheck %s +; RUN: opt -passes='drop-unnecessary-assumes' -S %s | FileCheck --check-prefix=DROP-DEREF %s + +declare void @use(ptr) + +define i8 @test_dereferenceable_assume_ptr_not_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: ret i8 0 +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_not_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: ret i8 0 +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + ret i8 0 +} + +define i8 @test_dereferenceable_assume_ptr_used_variable_size(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_assume_ptr_used_variable_size( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size) ] + %val = load i8, ptr %p + ret i8 %val +} + +define i8 @test_dereferenceable_with_align_ptr_used(ptr %p, i64 %size) { +; CHECK-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; CHECK-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 [[SIZE]]), "align"(ptr [[P]], i64 8) ] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; +; DROP-DEREF-LABEL: define i8 @test_dereferenceable_with_align_ptr_used( +; DROP-DEREF-SAME: ptr [[P:%.*]], i64 [[SIZE:%.*]]) { +; DROP-DEREF-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P]], i64 8) ] +; DROP-DEREF-NEXT: [[VAL:%.*]] = load i8, ptr [[P]], align 1 +; DROP-DEREF-NEXT: ret i8 [[VAL]] +; + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %size), "align"(ptr %p, i64 8) ] + %val = load i8, ptr %p + ret i8 %val +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index e3765ed541e7a..75276c0412647 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -106,23 +106,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP4]], i64 1 -; 
CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP6]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <2 x i64> [[TMP8]], splat (i64 225) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225) -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP17]], align 8, !alias.scope [[META0:![0-9]+]] @@ -182,23 +165,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.1: ; CHECK-NEXT: [[INDEX_1:%.*]] = phi i64 [ 0, [[VECTOR_PH_1]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_1]] ] ; CHECK-NEXT: [[TMP33:%.*]] = add nuw nsw i64 [[INDEX_1]], 15 -; CHECK-NEXT: [[TMP34:%.*]] = add nuw nsw i64 [[INDEX_1]], 16 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x i64> poison, i64 [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i64> [[TMP35]], i64 [[TMP34]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = add nuw nsw i64 [[INDEX_1]], 17 -; CHECK-NEXT: [[TMP38:%.*]] = add nuw nsw i64 [[INDEX_1]], 18 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], splat (i64 225) -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP41]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i1> [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225) -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i1> [[TMP42]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP43]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP44]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP46]]) ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP33]] ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP47]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr [[TMP47]], align 8, !alias.scope [[META0]] @@ -259,23 +225,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.2: ; CHECK-NEXT: [[INDEX_2:%.*]] = phi i64 [ 0, [[VECTOR_PH_2]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY_2]] ] ; CHECK-NEXT: [[TMP64:%.*]] = add nuw nsw i64 [[INDEX_2]], 30 -; CHECK-NEXT: [[TMP65:%.*]] = add nuw nsw i64 [[INDEX_2]], 31 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <2 x i64> poison, i64 [[TMP64]], i64 0 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <2 x i64> [[TMP66]], i64 [[TMP65]], i64 1 -; CHECK-NEXT: [[TMP68:%.*]] = add nuw nsw i64 [[INDEX_2]], 32 -; CHECK-NEXT: [[TMP69:%.*]] = add nuw nsw i64 [[INDEX_2]], 33 -; 
CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1 -; CHECK-NEXT: [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], splat (i64 225) -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i1> [[TMP72]], i64 0 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i1> [[TMP72]], i64 1 -; CHECK-NEXT: [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225) -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i1> [[TMP73]], i64 0 -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i1> [[TMP73]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP74]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP75]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP76]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP77]]) ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP78]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr [[TMP78]], align 8, !alias.scope [[META0]] @@ -336,23 +285,6 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: vector.body.3: ; CHECK-NEXT: [[INDEX_3:%.*]] = phi i64 [ 0, [[VECTOR_PH_3]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY_3]] ] ; CHECK-NEXT: [[TMP95:%.*]] = add nuw nsw i64 [[INDEX_3]], 45 -; CHECK-NEXT: [[TMP96:%.*]] = add nuw nsw i64 [[INDEX_3]], 46 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i64> poison, i64 [[TMP95]], i64 0 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <2 x i64> [[TMP97]], i64 [[TMP96]], i64 1 -; CHECK-NEXT: [[TMP99:%.*]] = add nuw nsw i64 [[INDEX_3]], 47 -; CHECK-NEXT: [[TMP100:%.*]] = add nuw nsw i64 [[INDEX_3]], 48 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1 -; CHECK-NEXT: [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], splat (i64 225) -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <2 x i1> [[TMP103]], i64 0 -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i1> [[TMP103]], i64 1 -; CHECK-NEXT: [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225) -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i1> [[TMP104]], i64 0 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i1> [[TMP104]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP105]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP106]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP107]]) -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP108]]) ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP95]] ; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP109]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr [[TMP109]], align 8, !alias.scope [[META0]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index e9149795954ec..fd7b75f22cb6d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -8,7 +8,6 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ] -; CHECK-NEXT: call void 
@llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -133,15 +132,14 @@ define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { ; CHECK-LABEL: define noundef ptr @std_find_caller( ; CHECK-SAME: ptr noundef [[FIRST:%.*]], ptr noundef [[LAST:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[FIRST3:%.*]] = ptrtoint ptr [[FIRST]] to i64 -; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 -; CHECK-NEXT: [[PTR_SUB:%.*]] = sub i64 [[LAST_I64]], [[FIRST3]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST]], i64 [[PTR_SUB]]) ] ; CHECK-NEXT: [[PRE_I:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] ; CHECK-NEXT: br i1 [[PRE_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT:.*]], label %[[LOOP_HEADER_I_PREHEADER:.*]] ; CHECK: [[LOOP_HEADER_I_PREHEADER]]: +; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64 +; CHECK-NEXT: [[FIRST3:%.*]] = ptrtoint ptr [[FIRST]] to i64 +; CHECK-NEXT: [[PTR_SUB:%.*]] = sub i64 [[LAST_I64]], [[FIRST3]] ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[PTR_SUB]] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST3]] From 0e6c8daabb76fe40e4e60e7e82a907648412e3cd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 10 Nov 2025 10:57:29 +0000 Subject: [PATCH 21/24] [X86] ldexp-avx512.ll - add v8f16/v16f16/v32f16 test coverage for #165694 (#167294) --- llvm/test/CodeGen/X86/ldexp-avx512.ll | 1288 ++++++++++++++++++++++++- 1 file changed, 1285 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index ea93a911a1ad0..21491bc2cc8f5 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -47,6 +47,187 @@ entry: } declare fp128 @ldexpl(fp128, i32) memory(none) +define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_8xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; 
AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $88, 
%rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $88, %rsp +; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512VL-NEXT: addq $88, %rsp +; AVX512VL-NEXT: retq + %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) + ret <8 x half> %r +} +declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) + define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_4xfloat: ; CHECK: # %bb.0: @@ -109,6 +290,381 @@ define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi } declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) +define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_16xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $168, %rsp +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512-NEXT: addq $168, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: 
vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, 
%xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; 
AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq + %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) + ret <16 x half> %r +} +declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) + define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_8xfloat: ; CHECK: # %bb.0: @@ -230,6 +786,735 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi } declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) +define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_32xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; 
AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl 
%ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte 
Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_32xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; 
AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; 
AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte 
Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax 
+; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: retq + %r = call <32 x half> 
@llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp) + ret <32 x half> %r +} +declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>) + define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_16xfloat: ; CHECK: # %bb.0: @@ -462,6 +1747,3 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi } declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX512: {{.*}} -; AVX512VL: {{.*}} From 54d86df4a22c226a957e87375b91db478c641d60 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 10 Nov 2025 11:11:51 +0000 Subject: [PATCH 22/24] [AArch64] Fallback to PRFUM for PRFM with negative or unaligned offset (#166756) Section C3.2.2 (quoted below) in the ARMARM makes this a requirement of assemblers for load/stores with unscaled offset. It makes no mention of PRFM so I don't consider this to be a bug, although I can see why we would want to extend this behaviour to the unscaled variants of these instructions as well, as GCC does. This patch adds an alias for this. C3.2.2 Load/store register (unscaled offset) The load/store register instructions with an unscaled offset support only one addressing mode: Base plus an unscaled 9-bit signed immediate offset. See Load/store addressing modes. The load/store register (unscaled offset) instructions are required to disambiguate this instruction class from the load/store register instruction forms that support an addressing mode of base plus a scaled, unsigned 12-bit immediate offset, because that can represent some offset values in the same range. The ambiguous immediate offsets are byte offsets that are both: In the range 0-255, inclusive. Naturally aligned to the access size. Other byte offsets in the range -256 to 255 inclusive are unambiguous. An assembler program translating a load/store instruction, for example LDR, is required to encode an unambiguous offset using the unscaled 9-bit offset form, and to encode an ambiguous offset using the scaled 12-bit offset form. A programmer might force the generation of the unscaled 9-bit form by using one of the mnemonics in Table C.3.21. Arm recommends that a disassembler outputs all unscaled 9-bit offset forms using one of these mnemonics, but unambiguous offsets can be output using a load/store single register mnemonic, for example, LDR. Fixes #83226. --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 +++ llvm/test/MC/AArch64/prfum.s | 44 +++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 llvm/test/MC/AArch64/prfum.s diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 76f076a60765f..b30e3d06b2c9f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4444,6 +4444,11 @@ defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", [(AArch64Prefetch timm:$Rt, (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). 
+def : InstAlias<"prfm $Rt, [$Rn, $offset]", + (PRFUMi prfop:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; + //--- // (unscaled immediate, unprivileged) defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; diff --git a/llvm/test/MC/AArch64/prfum.s b/llvm/test/MC/AArch64/prfum.s new file mode 100644 index 0000000000000..81a864a694325 --- /dev/null +++ b/llvm/test/MC/AArch64/prfum.s @@ -0,0 +1,44 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding --print-imm-hex=false < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \ +// RUN: | llvm-objdump -d --print-imm-hex=false - | FileCheck %s --check-prefix=CHECK-INST +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -disassemble -show-encoding --print-imm-hex=false \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// PRFM falls back to PRFUM for negative or unaligned offsets (not a multiple +// of 8). + +prfm pldl1keep, [x0, #-256] +// CHECK-INST: prfum pldl1keep, [x0, #-256] +// CHECK-ENCODING: [0x00,0x00,0x90,0xf8] + +prfm pldl1keep, [x0, #-8] +// CHECK-INST: prfum pldl1keep, [x0, #-8] +// CHECK-ENCODING: [0x00,0x80,0x9f,0xf8] + +prfm pldl1keep, [x0, #-1] +// CHECK-INST: prfum pldl1keep, [x0, #-1] +// CHECK-ENCODING: [0x00,0xf0,0x9f,0xf8] + +prfm pldl1keep, [x0, #0] +// CHECK-INST: prfm pldl1keep, [x0] +// CHECK-ENCODING: [0x00,0x00,0x80,0xf9] + +prfm pldl1keep, [x0, #1] +// CHECK-INST: prfum pldl1keep, [x0, #1] +// CHECK-ENCODING: [0x00,0x10,0x80,0xf8] + +prfm pldl1keep, [x0, #8] +// CHECK-INST: prfm pldl1keep, [x0, #8] +// CHECK-ENCODING: [0x00,0x04,0x80,0xf9] + +prfm pldl1keep, [x0, #255] +// CHECK-INST: prfum pldl1keep, [x0, #255] +// CHECK-ENCODING: [0x00,0xf0,0x8f,0xf8] + +prfm pldl1keep, [x0, #256] +// CHECK-INST: prfm pldl1keep, [x0, #256] +// CHECK-ENCODING: [0x00,0x80,0x80,0xf9] From b18d828eeae03cc9c42edf4d72911c75f117397c Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Mon, 10 Nov 2025 13:51:07 +0200 Subject: [PATCH 23/24] [tools][llc] Make save-stats.ll test target independent (#167238) --- llvm/test/tools/llc/save-stats.ll | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll index acb0367195043..4950625c809cc 100644 --- a/llvm/test/tools/llc/save-stats.ll +++ b/llvm/test/tools/llc/save-stats.ll @@ -1,10 +1,9 @@ ; REQUIRES: asserts -; REQUIRES: aarch64-registered-target -; RUN: llc -mtriple=arm64-apple-macosx --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s -; RUN: llc -mtriple=arm64-apple-macosx --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s -; RUN: llc -mtriple=arm64-apple-macosx --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s -; RUN: not llc -mtriple=arm64-apple-macosx --save-stats=invalid -o %t.s %s 2>&1 | FileCheck %s --check-prefix=INVALID_ARG +; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s +; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s +; RUN: not llc --save-stats=invalid -o %t.s %s 2>&1 | FileCheck %s --check-prefix=INVALID_ARG ; CHECK: { ; CHECK: "asm-printer.EmittedInsts": From 1ffe79d092909a2075705a10d932f0af0825577b Mon Sep 17 00:00:00 2001 From: "A. 
Jiang" Date: Mon, 10 Nov 2025 20:04:50 +0800 Subject: [PATCH 24/24] [libc++] Avoid overloaded `operator,` for (`T`, `Iter`) cases (#161049) Several components in libc++ aren't defending against overloaded `operator,(T, Iter)` currently. Existing deleted overloads in `test_iterators.h` are insufficient for such cases. This PR adds corresponding deleted overloads with reversed order and fixes these libc++ components. - `piecewise_linear_distribution`'s iterator pair constructor, - `piecewise_linear_distribution::param_type`'s iterator pair constructor, - `piecewise_constant_distribution`'s iterator pair constructor, - `piecewise_constant_distribution::param_type`'s iterator pair constructor, - `money_get::do_get`, - `money_put::do_put`, and - `num_put::do_put`. --- libcxx/include/__locale_dir/money.h | 4 +- libcxx/include/__locale_dir/num.h | 5 +- libcxx/include/__locale_dir/pad_and_output.h | 11 ++- .../piecewise_constant_distribution.h | 5 +- .../__random/piecewise_linear_distribution.h | 5 +- .../deque/deque.cons/iter_iter.pass.cpp | 5 +- .../deque/deque.cons/iter_iter_alloc.pass.cpp | 5 +- .../vector.bool/construct_iter_iter.pass.cpp | 4 +- .../construct_iter_iter_alloc.pass.cpp | 4 +- .../vector.cons/construct_iter_iter.pass.cpp | 4 +- .../construct_iter_iter_alloc.pass.cpp | 4 +- .../get_long_double_en_US.pass.cpp | 1 + .../get_long_double_fr_FR.pass.cpp | 1 + .../get_long_double_overlong.pass.cpp | 2 + .../get_long_double_ru_RU.pass.cpp | 1 + .../get_long_double_zh_CN.pass.cpp | 1 + .../get_string_en_US.pass.cpp | 1 + .../put_long_double_en_US.pass.cpp | 1 + .../put_long_double_fr_FR.pass.cpp | 2 + .../put_long_double_ru_RU.pass.cpp | 2 + .../put_long_double_zh_CN.pass.cpp | 1 + .../put_string_en_US.pass.cpp | 2 + .../facet.num.put.members/put_bool.pass.cpp | 2 + .../put_double.hex.pass.cpp | 1 + .../facet.num.put.members/put_double.pass.cpp | 1 + .../facet.num.put.members/put_long.pass.cpp | 2 + .../put_long_double.hex.pass.cpp | 1 + .../put_long_double.pass.cpp | 1 + .../put_long_long.pass.cpp | 2 + .../put_pointer.pass.cpp | 2 + .../put_unsigned_long.pass.cpp | 2 + .../put_unsigned_long_long.pass.cpp | 2 + .../ctor_iterator.pass.cpp | 6 +- .../param_ctor_iterator.pass.cpp | 6 +- .../ctor_iterator.pass.cpp | 6 +- .../param_ctor_iterator.pass.cpp | 6 +- libcxx/test/support/test_iterators.h | 68 +++++++++++++++---- 37 files changed, 133 insertions(+), 46 deletions(-) diff --git a/libcxx/include/__locale_dir/money.h b/libcxx/include/__locale_dir/money.h index c1296665505e1..12ba38467d805 100644 --- a/libcxx/include/__locale_dir/money.h +++ b/libcxx/include/__locale_dir/money.h @@ -433,7 +433,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( __err |= ios_base::failbit; return false; } - for (++__b; __fd > 0; --__fd, ++__b) { + for (++__b; __fd > 0; --__fd, (void)++__b) { if (__b == __e || !__ct.is(ctype_base::digit, *__b)) { __err |= ios_base::failbit; return false; @@ -451,7 +451,7 @@ bool money_get<_CharT, _InputIterator>::__do_get( } } if (__trailing_sign) { - for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, ++__b) { + for (unsigned __i = 1; __i < __trailing_sign->size(); ++__i, (void)++__b) { if (__b == __e || *__b != (*__trailing_sign)[__i]) { __err |= ios_base::failbit; return false; diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h index 7ca8ffe348959..ff357cd2d97db 100644 --- a/libcxx/include/__locale_dir/num.h +++ b/libcxx/include/__locale_dir/num.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___LOCALE_DIR_NUM_H #define 
_LIBCPP___LOCALE_DIR_NUM_H +#include <__algorithm/copy.h> #include <__algorithm/find.h> #include <__algorithm/reverse.h> #include <__charconv/to_chars_integral.h> @@ -885,9 +886,7 @@ num_put<_CharT, _OutputIterator>::do_put(iter_type __s, ios_base& __iob, char_ty const numpunct& __np = std::use_facet >(__iob.getloc()); typedef typename numpunct::string_type string_type; string_type __nm = __v ? __np.truename() : __np.falsename(); - for (typename string_type::iterator __i = __nm.begin(); __i != __nm.end(); ++__i, ++__s) - *__s = *__i; - return __s; + return std::copy(__nm.begin(), __nm.end(), __s); } template diff --git a/libcxx/include/__locale_dir/pad_and_output.h b/libcxx/include/__locale_dir/pad_and_output.h index a1cb37d0786da..bdd4d2856dad6 100644 --- a/libcxx/include/__locale_dir/pad_and_output.h +++ b/libcxx/include/__locale_dir/pad_and_output.h @@ -13,6 +13,8 @@ #if _LIBCPP_HAS_LOCALIZATION +# include <__algorithm/copy.h> +# include <__algorithm/fill_n.h> # include # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -30,12 +32,9 @@ _LIBCPP_HIDE_FROM_ABI _OutputIterator __pad_and_output( __ns -= __sz; else __ns = 0; - for (; __ob < __op; ++__ob, ++__s) - *__s = *__ob; - for (; __ns; --__ns, ++__s) - *__s = __fl; - for (; __ob < __oe; ++__ob, ++__s) - *__s = *__ob; + __s = std::copy(__ob, __op, __s); + __s = std::fill_n(__s, __ns, __fl); + __s = std::copy(__op, __oe, __s); __iob.width(0); return __s; } diff --git a/libcxx/include/__random/piecewise_constant_distribution.h b/libcxx/include/__random/piecewise_constant_distribution.h index c5bfa8dc3a4be..3faf339325f74 100644 --- a/libcxx/include/__random/piecewise_constant_distribution.h +++ b/libcxx/include/__random/piecewise_constant_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_CONSTANT_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/vector.h> @@ -190,8 +192,7 @@ piecewise_constant_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size() - 1); - for (size_t __i = 0; __i < __b_.size() - 1; ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size() - 1, std::back_inserter(__densities_)); __init(); } } diff --git a/libcxx/include/__random/piecewise_linear_distribution.h b/libcxx/include/__random/piecewise_linear_distribution.h index a9906430c005c..8aa3f19ca9004 100644 --- a/libcxx/include/__random/piecewise_linear_distribution.h +++ b/libcxx/include/__random/piecewise_linear_distribution.h @@ -9,9 +9,11 @@ #ifndef _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H #define _LIBCPP___RANDOM_PIECEWISE_LINEAR_DISTRIBUTION_H +#include <__algorithm/copy_n.h> #include <__algorithm/upper_bound.h> #include <__config> #include <__cstddef/ptrdiff_t.h> +#include <__iterator/back_insert_iterator.h> #include <__random/is_valid.h> #include <__random/uniform_real_distribution.h> #include <__vector/comparison.h> @@ -194,8 +196,7 @@ piecewise_linear_distribution<_RealType>::param_type::param_type( __areas_.assign(1, 0.0); } else { __densities_.reserve(__b_.size()); - for (size_t __i = 0; __i < __b_.size(); ++__i, ++__f_w) - __densities_.push_back(*__f_w); + std::copy_n(__f_w, __b_.size(), std::back_inserter(__densities_)); __init(); } } diff --git 
a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp index 1f8a044d0b602..59d93ac7ea411 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template deque(InputIterator f, InputIterator l); #include "asan_testing.h" +#include #include #include #include @@ -28,13 +29,11 @@ void test(InputIterator f, InputIterator l) { typedef typename std::iterator_traits::value_type T; typedef std::allocator Allocator; typedef std::deque C; - typedef typename C::const_iterator const_iterator; C d(f, l); assert(d.size() == static_cast(std::distance(f, l))); assert(static_cast(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } template diff --git a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp index 61318c3d0f2d3..ef876bb272fc7 100644 --- a/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/deque.cons/iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // deque(InputIterator f, InputIterator l, const allocator_type& a); #include "asan_testing.h" +#include #include #include #include @@ -28,14 +29,12 @@ template void test(InputIterator f, InputIterator l, const Allocator& a) { typedef typename std::iterator_traits::value_type T; typedef std::deque C; - typedef typename C::const_iterator const_iterator; C d(f, l, a); assert(d.get_allocator() == a); assert(d.size() == static_cast(std::distance(f, l))); assert(static_cast(std::distance(d.begin(), d.end())) == d.size()); LIBCPP_ASSERT(is_double_ended_contiguous_container_asan_correct(d)); - for (const_iterator i = d.begin(), e = d.end(); i != e; ++i, ++f) - assert(*i == *f); + assert(std::equal(d.begin(), d.end(), f)); } void basic_test() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp index e9fb2e6ecfbac..b862583c495e1 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter.pass.cpp @@ -11,6 +11,7 @@ // template vector(InputIter first, InputIter last); +#include #include #include #include @@ -24,8 +25,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { C c(first, last); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp index 71a176a0a64ba..3fe462eef80ed 100644 --- a/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector.bool/construct_iter_iter_alloc.pass.cpp @@ -12,6 +12,7 @@ // template vector(InputIter first, 
InputIter last, // const allocator_type& a); +#include #include #include #include @@ -25,8 +26,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const typename C:: C c(first, last, a); LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast(std::distance(first, last))); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } TEST_CONSTEXPR_CXX20 bool tests() { diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp index 1a6364a8018bc..f2ac013987eb8 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter.pass.cpp @@ -10,6 +10,7 @@ // template vector(InputIter first, InputIter last); +#include #include #include #include @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } // Test with an empty range { diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp index d1eff51011c4f..56a3778ddf965 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/construct_iter_iter_alloc.pass.cpp @@ -11,6 +11,7 @@ // template vector(InputIter first, InputIter last, // const allocator_type& a); +#include #include #include #include @@ -31,8 +32,7 @@ TEST_CONSTEXPR_CXX20 void test(Iterator first, Iterator last, const A& a) { LIBCPP_ASSERT(c.__invariants()); assert(c.size() == static_cast(std::distance(first, last))); LIBCPP_ASSERT(is_contiguous_container_asan_correct(c)); - for (typename C::const_iterator i = c.cbegin(), e = c.cend(); i != e; ++i, ++first) - assert(*i == *first); + assert(std::equal(c.cbegin(), c.cend(), first)); } #if TEST_STD_VER >= 11 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp index 9997b07134563..9861662bb59c7 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. 
// XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp index c9ed59f3cb9aa..002fc4b1ec7ef 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_fr_FR.pass.cpp @@ -8,6 +8,7 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.fr_FR.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp index 0b7a38e5104cd..8fe74cdaca5e4 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_overlong.pass.cpp @@ -16,6 +16,8 @@ // Ensure that money_get::do_get correct works when the input doesn't fit into the stack buffer // (100 characters currently). +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp index 371cf0e90c8d3..7ce267d0617b0 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_ru_RU.pass.cpp @@ -14,6 +14,7 @@ // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} // XFAIL: glibc-old-ru_RU-decimal-point +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp index c86df7e6b53bf..d83167d1ee458 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp 
b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp index 478df7964f6d2..0531260487b9f 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.get/locale.money.get.members/get_string_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp index 4b767fae871fa..0f2c81a805282 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_en_US.pass.cpp @@ -15,6 +15,7 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.en_US.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp index f9d7998b07ff4..733eea94fd9bd 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_fr_FR.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.fr_FR.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp index be1e397488468..24cc4fdb47f75 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_ru_RU.pass.cpp @@ -9,6 +9,8 @@ // NetBSD does not support LC_MONETARY at the moment // XFAIL: netbsd +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.ru_RU.UTF-8 // ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP} diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp index 25046a8417083..d970b55eb704b 100644 --- 
a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_long_double_zh_CN.pass.cpp @@ -10,6 +10,7 @@ // XFAIL: netbsd // XFAIL: LIBCXX-FREEBSD-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME // REQUIRES: locale.zh_CN.UTF-8 diff --git a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp index 1c8710a008f27..9770912da9dcf 100644 --- a/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.monetary/locale.money.put/locale.money.put.members/put_string_en_US.pass.cpp @@ -16,6 +16,8 @@ // Bionic has minimal locale support, investigate this later. // XFAIL: LIBCXX-ANDROID-FIXME +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // REQUIRES: locale.en_US.UTF-8 #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp index d62a27a0f6ae9..22997ebbbc82d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_bool.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, bool v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp index dea2be771e0c6..a4ef158954f59 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp index b131a41ceac34..45ede5a395c63 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp 
b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp index 7f034d487e57e..c3565c5bab11d 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp index 8db40b9e0dcbc..9e84fa8a53afe 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.hex.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-a-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp index d044898a1f828..e2868cfb37140 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_double.pass.cpp @@ -13,6 +13,7 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long double v) const; // XFAIL: win32-broken-printf-g-precision +// XFAIL: FROZEN-CXX03-HEADERS-FIXME #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp index 2f4dd42e1a20c..4f60835880422 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp index fed5b4a610fd4..57607e6d6a521 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_pointer.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, void* v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include 
#include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp index 714c8dd8ccd9f..11216a3d111e3 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp index 70ae4b3ae9de0..5dd555eda1e56 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.nm.put/facet.num.put.members/put_unsigned_long_long.pass.cpp @@ -12,6 +12,8 @@ // iter_type put(iter_type s, ios_base& iob, char_type fill, unsigned long long v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp index ea6e807ca47b5..400cfd78d94a3 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_constant_distribution<> D; + typedef cpp17_input_iterator InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp index baf6108b7e2e8..8b3e21fc0932e 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.pconst/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_constant_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); 
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp index 24f7d4e18c36a..8ed56ecdd31e9 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/ctor_iterator.pass.cpp @@ -16,20 +16,24 @@ // InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) { { typedef std::piecewise_linear_distribution<> D; + typedef cpp17_input_iterator InIt; double b[] = {10}; double p[] = {12}; - D d(b, b, p); + D d((InIt(b)), (InIt(b)), (InIt(p))); std::vector iv = d.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp index 04ded2a1c9706..272d0b4c87459 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.plinear/param_ctor_iterator.pass.cpp @@ -15,11 +15,14 @@ // param_type(InputIteratorB firstB, InputIteratorB lastB, // InputIteratorW firstW); +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include +#include "test_iterators.h" #include "test_macros.h" int main(int, char**) @@ -27,9 +30,10 @@ int main(int, char**) { typedef std::piecewise_linear_distribution<> D; typedef D::param_type P; + typedef cpp17_input_iterator InIt; double b[] = {10}; double p[] = {12}; - P pa(b, b, p); + P pa((InIt(b)), (InIt(b)), (InIt(p))); std::vector iv = pa.intervals(); assert(iv.size() == 2); assert(iv[0] == 0); diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index 0335a4c561017..4fc8345c2dcef 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -59,6 +59,9 @@ class cpp17_output_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const cpp17_output_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -109,6 +112,9 @@ class cpp17_input_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const cpp17_input_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -157,6 +163,9 @@ class forward_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const forward_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -203,6 +212,9 @@ class bidirectional_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const bidirectional_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -261,6 +273,9 @@ class random_access_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -390,6 +405,9 @@ class three_way_random_access_iterator { template void operator,(T const&) = delete; + + template + friend void operator,(const T&, const three_way_random_access_iterator&) = delete; }; #if TEST_STD_VER > 14 template @@ -485,6 +503,9 @@ 
class cpp20_random_access_iterator { template void operator,(T const&) = delete; + + template + friend void operator,(const T&, const cpp20_random_access_iterator&) = delete; }; template cpp20_random_access_iterator(It) -> cpp20_random_access_iterator; @@ -578,6 +599,9 @@ class contiguous_iterator { template void operator,(T const&) = delete; + + template + friend void operator,(const T&, const contiguous_iterator&) = delete; }; template contiguous_iterator(It) -> contiguous_iterator; @@ -635,6 +659,9 @@ class three_way_contiguous_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const three_way_contiguous_iterator&) = delete; }; template three_way_contiguous_iterator(It) -> three_way_contiguous_iterator; @@ -746,7 +773,10 @@ struct ThrowingIterator { template void operator,(T2 const &) = delete; -private: + template + friend void operator,(const T2&, const ThrowingIterator&) = delete; + + private: const T* begin_; const T* end_; const T* current_; @@ -817,7 +847,10 @@ struct NonThrowingIterator { template void operator,(T2 const &) = delete; -private: + template + friend void operator,(const T2&, const NonThrowingIterator&) = delete; + + private: const T *begin_; const T *end_; const T *current_; @@ -847,6 +880,9 @@ class cpp20_input_iterator template void operator,(T const &) = delete; + + template + friend void operator,(const T&, const cpp20_input_iterator&) = delete; }; template cpp20_input_iterator(It) -> cpp20_input_iterator; @@ -884,6 +920,9 @@ class cpp20_output_iterator { template void operator,(T const&) = delete; + + template + friend void operator,(const T&, const cpp20_output_iterator&) = delete; }; template cpp20_output_iterator(It) -> cpp20_output_iterator; @@ -1077,17 +1116,20 @@ class operation_counting_iterator { template void operator,(T const &) = delete; -private: - constexpr void moved_by(difference_type n) { - if (counts_ == nullptr) - return; - if (n > 0) - ++counts_->increments; - else if (n < 0) - ++counts_->decrements; - else - ++counts_->zero_moves; - } + template + friend void operator,(const T&, const operation_counting_iterator&) = delete; + + private: + constexpr void moved_by(difference_type n) { + if (counts_ == nullptr) + return; + if (n > 0) + ++counts_->increments; + else if (n < 0) + ++counts_->decrements; + else + ++counts_->zero_moves; + } decltype(base(std::declval())) base_; IteratorOpCounts* counts_ = nullptr;
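The hunks above show the two halves of the defense this patch relies on: loop headers in libc++ now write "(void)++__b" so the comma in "--__fd, (void)++__b" is always the built-in operator, and the test iterators gain deleted friend overloads of operator,() with the iterator on the right-hand side, so any remaining unguarded "x, ++it" expression fails to compile in the test suite. The sketch below is illustrative only and is not libc++ code or part of this patch; the names ToyIterator and count_between are invented for the example, but the deleted-overload pattern and the (void) cast mirror the changes in test_iterators.h and money.h.

    // Minimal sketch of the hazard and the defense (assumed names, not libc++ APIs).
    #include <cassert>

    template <class It>
    class ToyIterator {
      It it_;

    public:
      explicit ToyIterator(It it) : it_(it) {}
      decltype(auto) operator*() const { return *it_; }
      ToyIterator& operator++() { ++it_; return *this; }
      friend bool operator==(ToyIterator a, ToyIterator b) { return a.it_ == b.it_; }
      friend bool operator!=(ToyIterator a, ToyIterator b) { return !(a == b); }

      // Same defense as the test_iterators.h change: reject an overloaded comma
      // with the iterator on either side of the comma expression.
      template <class T>
      void operator,(const T&) = delete;
      template <class T>
      friend void operator,(const T&, const ToyIterator&) = delete;
    };

    // Loop written in the style of the fixed money_get::__do_get: the (void) cast
    // forces the built-in comma operator, so this compiles against ToyIterator.
    // Writing "++n, ++first" instead would select the deleted friend operator,()
    // above and fail to compile, which is exactly what the test iterators detect.
    template <class It>
    int count_between(It first, It last) {
      int n = 0;
      for (; first != last; ++n, (void)++first)
        ;
      return n;
    }

    int main() {
      int a[] = {1, 2, 3, 4};
      assert(count_between(ToyIterator<int*>(a), ToyIterator<int*>(a + 4)) == 4);
      return 0;
    }

The reversed-order friend overloads matter because the pre-existing deleted member operator,() only fires when the test iterator is the left operand ("it, x"); expressions of the form "x, ++it", as in the loop increments fixed by this patch, need a deleted overload that accepts the iterator on the right to be diagnosed.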