From 1bd035d80f7a92a8e694d4c1f75733d41775ed44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Plewa?= Date: Thu, 13 Nov 2025 16:56:38 +0100 Subject: [PATCH 01/25] [offload] defer "---> olInit" trace message (#167893) Tracing requires liboffload to be initialized, so calling isTracingEnabled() before olInit always returns false. This caused the first trace log to look like: ``` -> OL_SUCCESS ``` instead of: ``` ---> olInit() -> OL_SUCCESS ``` This patch moves the pre-call trace print for olInit so it is emitted only after initialization. It would be possible to add extra logic to detect whether liboffload is already initialized and only postpone the first pre-call print, but this would add unnecessary complexity, especially since this is tablegen code. The difference would matter only in the unlikely case of a crash during a second olInit call. --------- Co-authored-by: Joseph Huber --- offload/tools/offload-tblgen/EntryPointGen.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/offload/tools/offload-tblgen/EntryPointGen.cpp b/offload/tools/offload-tblgen/EntryPointGen.cpp index 4e42e4905b993..4f76100ed2dc3 100644 --- a/offload/tools/offload-tblgen/EntryPointGen.cpp +++ b/offload/tools/offload-tblgen/EntryPointGen.cpp @@ -83,13 +83,15 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) { OS << ") {\n"; // Check offload is initialized - if (F.getName() != "olInit") + if (F.getName() != "olInit") { OS << "if (!llvm::offload::isOffloadInitialized()) return &UninitError;"; - // Emit pre-call prints - OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n"; - OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName()); - OS << TAB_1 "}\n\n"; + // Emit pre-call prints + // Postpone pre-calls for olInit as tracing requires liboffload to be initialized + OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n"; + OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName()); + OS << TAB_1 
"}\n\n"; + } // Perform actual function call to the validation wrapper ParamNameList = ParamNameList.substr(0, ParamNameList.size() - 2); @@ -99,6 +101,10 @@ static void EmitEntryPointFunc(const FunctionRec &F, raw_ostream &OS) { // Emit post-call prints OS << TAB_1 "if (llvm::offload::isTracingEnabled()) {\n"; + // postponed pre-call print for olInit + if (F.getName() == "olInit") + OS << formatv(TAB_2 "llvm::errs() << \"---> {0}\";\n", F.getName()); + if (F.getParams().size() > 0) { OS << formatv(TAB_2 "{0} Params = {{", F.getParamStructName()); for (const auto &Param : F.getParams()) { From a12600cac1aab81ce1d317fe802ae7cf46889d48 Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Thu, 13 Nov 2025 09:00:26 -0700 Subject: [PATCH 02/25] [flang][NFC] Strip trailing whitespace from tests (4 of N) Only the fortran source files in flang/test/Intrinsics have been modified. The other files in flang/test will be cleaned up in subsequent commits --- flang/test/Lower/Intrinsics/adjustl.f90 | 1 - flang/test/Lower/Intrinsics/adjustr.f90 | 1 - flang/test/Lower/Intrinsics/associated.f90 | 16 +++++++------- flang/test/Lower/Intrinsics/btest.f90 | 1 - flang/test/Lower/Intrinsics/ceiling.f90 | 2 -- flang/test/Lower/Intrinsics/count.f90 | 4 ++-- flang/test/Lower/Intrinsics/cpu_time.f90 | 1 - flang/test/Lower/Intrinsics/date_and_time.f90 | 6 ++--- flang/test/Lower/Intrinsics/eoshift.f90 | 22 +++++++++---------- .../execute_command_line-optional.f90 | 8 +++---- .../Lower/Intrinsics/execute_command_line.f90 | 12 +++++----- flang/test/Lower/Intrinsics/exit.f90 | 2 +- .../test/Lower/Intrinsics/extends_type_of.f90 | 2 +- flang/test/Lower/Intrinsics/floor.f90 | 1 - .../get_command_argument-optional.f90 | 4 ++-- flang/test/Lower/Intrinsics/ichar.f90 | 2 +- flang/test/Lower/Intrinsics/ishftc.f90 | 18 +++++++-------- flang/test/Lower/Intrinsics/max.f90 | 16 +++++++------- flang/test/Lower/Intrinsics/maxloc.f90 | 6 ++--- flang/test/Lower/Intrinsics/merge.f90 | 2 +- 
flang/test/Lower/Intrinsics/minloc.f90 | 6 ++--- flang/test/Lower/Intrinsics/modulo.f90 | 2 +- flang/test/Lower/Intrinsics/nint.f90 | 1 - flang/test/Lower/Intrinsics/not.f90 | 1 - flang/test/Lower/Intrinsics/pack.f90 | 2 +- flang/test/Lower/Intrinsics/perror.f90 | 12 +++++----- flang/test/Lower/Intrinsics/product.f90 | 2 +- flang/test/Lower/Intrinsics/reduce.f90 | 6 ++--- flang/test/Lower/Intrinsics/reshape.f90 | 10 ++++----- flang/test/Lower/Intrinsics/scale.f90 | 2 +- flang/test/Lower/Intrinsics/spread.f90 | 2 +- flang/test/Lower/Intrinsics/sum.f90 | 2 +- flang/test/Lower/Intrinsics/system.f90 | 4 ++-- flang/test/Lower/Intrinsics/transfer.f90 | 6 ++--- flang/test/Lower/Intrinsics/unlink-sub.f90 | 2 +- 35 files changed, 89 insertions(+), 98 deletions(-) diff --git a/flang/test/Lower/Intrinsics/adjustl.f90 b/flang/test/Lower/Intrinsics/adjustl.f90 index a742f58db5c48..b66a8409c083d 100644 --- a/flang/test/Lower/Intrinsics/adjustl.f90 +++ b/flang/test/Lower/Intrinsics/adjustl.f90 @@ -16,4 +16,3 @@ subroutine adjustl_test ! CHECK: fir.call @_FortranAAdjustl(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () adjust_str = adjustl(adjust_str) end subroutine - diff --git a/flang/test/Lower/Intrinsics/adjustr.f90 b/flang/test/Lower/Intrinsics/adjustr.f90 index a929ab17ab9ff..8e823718db9f2 100644 --- a/flang/test/Lower/Intrinsics/adjustr.f90 +++ b/flang/test/Lower/Intrinsics/adjustr.f90 @@ -16,4 +16,3 @@ subroutine adjustr_test ! CHECK: fir.call @_FortranAAdjustr(%[[r3]], %[[r4]], %[[r5]], %{{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.ref, i32) -> () adjust_str = adjustr(adjust_str) end subroutine - diff --git a/flang/test/Lower/Intrinsics/associated.f90 b/flang/test/Lower/Intrinsics/associated.f90 index 9308ec7cc1a15..b32e0abd9bb0a 100644 --- a/flang/test/Lower/Intrinsics/associated.f90 +++ b/flang/test/Lower/Intrinsics/associated.f90 @@ -23,11 +23,11 @@ subroutine associated_test(scalar, array) ! 
CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[sbox]], %[[zbox]]) {{.*}}: (!fir.box, !fir.box) -> i1 print *, associated(scalar, ziel) end subroutine - + subroutine test_func_results() interface function get_pointer() - real, pointer :: get_pointer(:) + real, pointer :: get_pointer(:) end function end interface ! CHECK: %[[result:.*]] = fir.call @_QPget_pointer() {{.*}}: () -> !fir.box>> @@ -38,7 +38,7 @@ function get_pointer() ! CHECK: arith.cmpi ne, %[[addr_cast]], %c0{{.*}} : i64 print *, associated(get_pointer()) end subroutine - + ! CHECK-LABEL: func @_QPtest_optional_target_1( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref> {fir.bindc_name = "optionales_ziel", fir.optional, fir.target}) { @@ -61,7 +61,7 @@ subroutine test_optional_target_1(p, optionales_ziel) ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_4]] : (!fir.box>) -> !fir.box ! CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_14]], %[[VAL_15]]) {{.*}}: (!fir.box, !fir.box) -> i1 end subroutine - + ! CHECK-LABEL: func @_QPtest_optional_target_2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box> {fir.bindc_name = "optionales_ziel", fir.optional, fir.target}) { @@ -81,7 +81,7 @@ subroutine test_optional_target_2(p, optionales_ziel) ! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box ! CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_11]], %[[VAL_12]]) {{.*}}: (!fir.box, !fir.box) -> i1 end subroutine - + ! CHECK-LABEL: func @_QPtest_optional_target_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> {fir.bindc_name = "optionales_ziel", fir.optional}) { @@ -102,7 +102,7 @@ subroutine test_optional_target_3(p, optionales_ziel) ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box ! 
CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_12]], %[[VAL_13]]) {{.*}}: (!fir.box, !fir.box) -> i1 end subroutine - + ! CHECK-LABEL: func @_QPtest_optional_target_4( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> {fir.bindc_name = "optionales_ziel", fir.optional, fir.target}) { @@ -123,7 +123,7 @@ subroutine test_optional_target_4(p, optionales_ziel) ! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box ! CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_12]], %[[VAL_13]]) {{.*}}: (!fir.box, !fir.box) -> i1 end subroutine - + ! CHECK-LABEL: func @_QPtest_pointer_target( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> {fir.bindc_name = "pointer_ziel"}) { @@ -137,7 +137,7 @@ subroutine test_pointer_target(p, pointer_ziel) ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_7]] : (!fir.box>>) -> !fir.box ! CHECK: fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_9]], %[[VAL_10]]) {{.*}}: (!fir.box, !fir.box) -> i1 end subroutine - + ! CHECK-LABEL: func @_QPtest_allocatable_target( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.bindc_name = "p"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> {fir.bindc_name = "allocatable_ziel", fir.target}) { diff --git a/flang/test/Lower/Intrinsics/btest.f90 b/flang/test/Lower/Intrinsics/btest.f90 index 6c0fccd0f5a9f..b10850ef0b5e6 100644 --- a/flang/test/Lower/Intrinsics/btest.f90 +++ b/flang/test/Lower/Intrinsics/btest.f90 @@ -15,4 +15,3 @@ function btest_test(i, j) ! CHECK: return %[[VAL_9]] : !fir.logical<4> btest_test = btest(i, j) end - \ No newline at end of file diff --git a/flang/test/Lower/Intrinsics/ceiling.f90 b/flang/test/Lower/Intrinsics/ceiling.f90 index 8c283de762e28..3c87bec3032e0 100644 --- a/flang/test/Lower/Intrinsics/ceiling.f90 +++ b/flang/test/Lower/Intrinsics/ceiling.f90 @@ -16,5 +16,3 @@ subroutine ceiling_test2(i, a) ! CHECK: %[[f:.*]] = math.ceil %{{.*}} : f32 ! 
CHECK: fir.convert %[[f]] : (f32) -> i64 end subroutine - - diff --git a/flang/test/Lower/Intrinsics/count.f90 b/flang/test/Lower/Intrinsics/count.f90 index c3efe6b4bf077..064d01163985d 100644 --- a/flang/test/Lower/Intrinsics/count.f90 +++ b/flang/test/Lower/Intrinsics/count.f90 @@ -11,7 +11,7 @@ subroutine count_test1(rslt, mask) rslt = count(mask) ! CHECK: %[[a5:.*]] = fir.call @_FortranACount(%[[a2]], %{{.*}}, %{{.*}}, %[[a4]]) {{.*}}: (!fir.box, !fir.ref, i32, i32) -> i64 end subroutine - + ! CHECK-LABEL: test_count2 ! CHECK-SAME: %[[arg0:.*]]: !fir.box>{{.*}}, %[[arg1:.*]]: !fir.box>>{{.*}}) subroutine test_count2(rslt, mask) @@ -29,7 +29,7 @@ subroutine test_count2(rslt, mask) ! CHECK: %[[a12:.*]] = fir.box_addr %[[a10]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a12]] end subroutine - + ! CHECK-LABEL: test_count3 ! CHECK-SAME: %[[arg0:.*]]: !fir.ref{{.*}}, %[[arg1:.*]]: !fir.box>>{{.*}}) subroutine test_count3(rslt, mask) diff --git a/flang/test/Lower/Intrinsics/cpu_time.f90 b/flang/test/Lower/Intrinsics/cpu_time.f90 index 25ff4f8821145..73eead3796444 100644 --- a/flang/test/Lower/Intrinsics/cpu_time.f90 +++ b/flang/test/Lower/Intrinsics/cpu_time.f90 @@ -8,4 +8,3 @@ subroutine cpu_time_test(t) ! CHECK: fir.store %[[result32]] to %arg0 : !fir.ref call cpu_time(t) end subroutine - diff --git a/flang/test/Lower/Intrinsics/date_and_time.f90 b/flang/test/Lower/Intrinsics/date_and_time.f90 index 55b1383766cb8..d9ca46ea83dc6 100644 --- a/flang/test/Lower/Intrinsics/date_and_time.f90 +++ b/flang/test/Lower/Intrinsics/date_and_time.f90 @@ -18,13 +18,13 @@ subroutine date_and_time_test(date, time, zone, values) ! CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[valuesCast]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> () call date_and_time(date, time, zone, values) end subroutine - + ! 
CHECK-LABEL: func @_QPdate_and_time_test2( ! CHECK-SAME: %[[date:.*]]: !fir.boxchar<1>{{.*}}) subroutine date_and_time_test2(date) character(*) :: date ! CHECK: %[[dateUnbox:.*]]:2 = fir.unboxchar %[[date]] : (!fir.boxchar<1>) -> (!fir.ref>, index) - ! CHECK: %[[values:.*]] = fir.absent !fir.box + ! CHECK: %[[values:.*]] = fir.absent !fir.box ! CHECK: %[[dateBuffer:.*]] = fir.convert %[[dateUnbox]]#0 : (!fir.ref>) -> !fir.ref ! CHECK: %[[dateLen:.*]] = fir.convert %[[dateUnbox]]#1 : (index) -> i64 ! CHECK: %[[timeBuffer:.*]] = fir.convert %c0{{.*}} : (index) -> !fir.ref @@ -34,7 +34,7 @@ subroutine date_and_time_test2(date) ! CHECK: fir.call @_FortranADateAndTime(%[[dateBuffer]], %[[dateLen]], %[[timeBuffer]], %[[timeLen]], %[[zoneBuffer]], %[[zoneLen]], %{{.*}}, %{{.*}}, %[[values]]) {{.*}}: (!fir.ref, i64, !fir.ref, i64, !fir.ref, i64, !fir.ref, i32, !fir.box) -> () call date_and_time(date) end subroutine - + ! CHECK-LABEL: func @_QPdate_and_time_dynamic_optional( ! CHECK-SAME: %[[VAL_0:[^:]*]]: !fir.boxchar<1> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> diff --git a/flang/test/Lower/Intrinsics/eoshift.f90 b/flang/test/Lower/Intrinsics/eoshift.f90 index 9cd0b86fadc52..4f01ce989c9f0 100644 --- a/flang/test/Lower/Intrinsics/eoshift.f90 +++ b/flang/test/Lower/Intrinsics/eoshift.f90 @@ -13,16 +13,16 @@ subroutine eoshift_test1(arr, shift) ! CHECK: fir.store %[[init]] to %[[resBox]] : !fir.ref>>>> ! CHECK: %[[boundBox:.*]] = fir.absent !fir.box ! CHECK: %[[shift:.*]] = fir.load %arg1 : !fir.ref - + res = eoshift(arr, shift) - + ! CHECK: %[[resIRBox:.*]] = fir.convert %[[resBox]] : (!fir.ref>>>>) -> !fir.ref> ! CHECK: %[[arrBox:.*]] = fir.convert %[[arr]] : (!fir.box>>) -> !fir.box ! CHECK: %[[shiftBox:.*]] = fir.convert %[[shift]] : (i32) -> i64 ! CHECK: fir.call @_FortranAEoshiftVector(%[[resIRBox]], %[[arrBox]], %[[shiftBox]], %[[boundBox]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> () ! 
CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x!fir.logical<4>>, !fir.array<3x!fir.logical<4>>, !fir.ref>> end subroutine eoshift_test1 - + ! CHECK-LABEL: eoshift_test2 subroutine eoshift_test2(arr, shift, bound, dim) integer, dimension(3,3) :: arr, res @@ -31,9 +31,9 @@ subroutine eoshift_test2(arr, shift, bound, dim) ! CHECK: %[[resBox:.*]] = fir.alloca !fir.box>> ! CHECK: %[[res:.*]] = fir.alloca !fir.array<3x3xi32> {bindc_name = "res", uniq_name = "_QFeoshift_test2Eres"} !CHECK: %[[resLoad:.*]] = fir.array_load %[[res]]({{.*}}) : (!fir.ref>, !fir.shape<2>) -> !fir.array<3x3xi32> - + res = eoshift(arr, shift, bound, dim) - + ! CHECK: %[[arr:.*]] = fir.embox %arg0({{.*}}) : (!fir.ref>, !fir.shape<2>) -> !fir.box> ! CHECK: %[[boundBox:.*]] = fir.embox %arg2 : (!fir.ref) -> !fir.box ! CHECK: %[[dim:.*]] = fir.load %arg3 : !fir.ref @@ -42,16 +42,16 @@ subroutine eoshift_test2(arr, shift, bound, dim) ! CHECK: %[[arrBox:.*]] = fir.convert %[[arr]] : (!fir.box>) -> !fir.box ! CHECK: %[[shiftBoxNone:.*]] = fir.convert %[[shiftBox]] : (!fir.box>) -> !fir.box ! CHECK: %[[boundBoxNone:.*]] = fir.convert %[[boundBox]] : (!fir.box) -> !fir.box - + ! CHECK: fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrBox]], %[[shiftBoxNone]], %[[boundBoxNone]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x3xi32>, !fir.array<3x3xi32>, !fir.ref> end subroutine eoshift_test2 - + ! CHECK-LABEL: eoshift_test3 subroutine eoshift_test3(arr, shift, dim) character(4), dimension(3,3) :: arr, res integer :: shift, dim - + ! CHECK: %[[resBox:.*]] = fir.alloca !fir.box>>> ! CHECK: %[[arr:.*]]:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[array:.*]] = fir.convert %[[arr]]#0 : (!fir.ref>) -> !fir.ref>> @@ -59,9 +59,9 @@ subroutine eoshift_test3(arr, shift, dim) ! 
CHECK: %[[resLoad:.*]] = fir.array_load %[[res]]({{.*}}) : (!fir.ref>>, !fir.shape<2>) -> !fir.array<3x3x!fir.char<1,4>> ! CHECK: %[[arrayBox:.*]] = fir.embox %[[array]]({{.*}}) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> ! CHECK: %[[dim:.*]] = fir.load %arg2 : !fir.ref - + res = eoshift(arr, SHIFT=shift, DIM=dim) - + ! CHECK: %[[boundBox:.*]] = fir.absent !fir.box ! CHECK: %[[shiftBox:.*]] = fir.embox %arg1 : (!fir.ref) -> !fir.box ! CHECK: %[[resIRBox:.*]] = fir.convert %[[resBox]] : (!fir.ref>>>>) -> !fir.ref> @@ -70,7 +70,7 @@ subroutine eoshift_test3(arr, shift, dim) ! CHECK: fir.call @_FortranAEoshift(%[[resIRBox]], %[[arrayBoxNone]], %[[shiftBoxNone]], %[[boundBox]], %[[dim]], {{.*}}, {{.*}}) {{.*}}: (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () ! CHECK: fir.array_merge_store %[[resLoad]], {{.*}} to %[[res]] : !fir.array<3x3x!fir.char<1,4>>, !fir.array<3x3x!fir.char<1,4>>, !fir.ref>> end subroutine eoshift_test3 - + ! CHECK-LABEL: func @_QPeoshift_test_dynamic_optional( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 index f8c667f3fa82d..a4137dfd47f79 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 @@ -12,9 +12,9 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) LOGICAL, OPTIONAL :: isWait ! Note: command is not optional in execute_command_line and must be present call execute_command_line(command, isWait, exitVal, cmdVal, msg) -! CHECK-NEXT: %[[c14:.*]] = arith.constant 14 : i32 -! CHECK-NEXT: %true = arith.constant true -! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 +! CHECK-NEXT: %[[c14:.*]] = arith.constant 14 : i32 +! CHECK-NEXT: %true = arith.constant true +! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 ! 
CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref, !fir.dscope) -> !fir.ref ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) @@ -35,7 +35,7 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) ! CHECK-NEXT: %[[cmdstatArgBox:.*]] = fir.embox %[[cmdstatDeclare]] : (!fir.ref) -> !fir.box ! CHECK-NEXT: %[[cmdstatBox:.*]] = arith.select %[[cmdstatIsPresent]], %[[cmdstatArgBox]], %[[absentBoxi32]] : !fir.box ! CHECK-NEXT: %[[cmdmsgArgBox:.*]] = fir.embox %[[cmdmsgDeclare]] typeparams %[[cmdmsgUnbox]]#1 : (!fir.ref>, index) -> !fir.box> -! CHECK-NEXT: %[[absentBox:.*]] = fir.absent !fir.box> +! CHECK-NEXT: %[[absentBox:.*]] = fir.absent !fir.box> ! CHECK-NEXT: %[[cmdmsgBox:.*]] = arith.select %[[cmdmsgIsPresent]], %[[cmdmsgArgBox]], %[[absentBox]] : !fir.box> ! CHECK-NEXT: %[[waitCast:.*]] = fir.convert %[[waitDeclare]] : (!fir.ref>) -> i64 ! CHECK-NEXT: %[[waitPresent:.*]] = arith.cmpi ne, %[[waitCast]], %[[c0]] : i64 diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90 index e70513068ab3e..e29c09688e6d1 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line.f90 @@ -11,9 +11,9 @@ subroutine all_args(command, isWait, exitVal, cmdVal, msg) INTEGER :: exitVal, cmdVal LOGICAL :: isWait call execute_command_line(command, isWait, exitVal, cmdVal, msg) -! CHECK-NEXT: %[[c13:.*]] = arith.constant 13 : i32 -! CHECK-NEXT: %true = arith.constant true -! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 +! CHECK-NEXT: %[[c13:.*]] = arith.constant 13 : i32 +! CHECK-NEXT: %true = arith.constant true +! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 ! 
CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref, !fir.dscope) -> !fir.ref @@ -51,8 +51,8 @@ end subroutine all_args subroutine only_command_default_wait_true(command) CHARACTER(30) :: command call execute_command_line(command) -! CHECK-NEXT: %[[c52:.*]] = arith.constant 53 : i32 -! CHECK-NEXT: %true = arith.constant true +! CHECK-NEXT: %[[c52:.*]] = arith.constant 53 : i32 +! CHECK-NEXT: %true = arith.constant true ! CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[cmdArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) @@ -60,7 +60,7 @@ subroutine only_command_default_wait_true(command) ! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] dummy_scope %[[DSCOPE]] arg {{[0-9]+}} {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref>, index, !fir.dscope) -> !fir.ref> ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref>) -> !fir.box> ! CHECK-NEXT: %[[absent:.*]] = fir.absent !fir.box -! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box +! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box ! CHECK: fir.call @_FortranAExecuteCommandLine(%[[command]], %true, %[[absent]], %[[absent]], %[[absent]], %[[VAL_7:.*]], %[[c52]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> () ! 
CHECK-NEXT: return end subroutine only_command_default_wait_true diff --git a/flang/test/Lower/Intrinsics/exit.f90 b/flang/test/Lower/Intrinsics/exit.f90 index d80efc556f95e..49b41346cac0d 100644 --- a/flang/test/Lower/Intrinsics/exit.f90 +++ b/flang/test/Lower/Intrinsics/exit.f90 @@ -10,7 +10,7 @@ subroutine exit_test1 ! CHECK-32: fir.call @_FortranAExit(%[[status]]) {{.*}}: (i32) -> () ! CHECK-64: fir.call @_FortranAExit(%[[statusConvert]]) {{.*}}: (i32) -> () end subroutine exit_test1 - + ! CHECK-LABEL: func @_QPexit_test2( ! CHECK-SAME: %[[statusArg:.*]]: !fir.ref{{.*}}) { subroutine exit_test2(status) diff --git a/flang/test/Lower/Intrinsics/extends_type_of.f90 b/flang/test/Lower/Intrinsics/extends_type_of.f90 index f99a63e30a552..d69e35ff934d9 100644 --- a/flang/test/Lower/Intrinsics/extends_type_of.f90 +++ b/flang/test/Lower/Intrinsics/extends_type_of.f90 @@ -9,7 +9,7 @@ module extends_type_of_mod type, extends(p1) :: p2 integer :: b end type - + type k1(a) integer, kind :: a end type diff --git a/flang/test/Lower/Intrinsics/floor.f90 b/flang/test/Lower/Intrinsics/floor.f90 index 63d6d2fccee86..b478b6732efeb 100644 --- a/flang/test/Lower/Intrinsics/floor.f90 +++ b/flang/test/Lower/Intrinsics/floor.f90 @@ -16,4 +16,3 @@ subroutine floor_test2(i, a) ! CHECK: %[[f:.*]] = math.floor %{{.*}} : f32 ! CHECK: fir.convert %[[f]] : (f32) -> i64 end subroutine - diff --git a/flang/test/Lower/Intrinsics/get_command_argument-optional.f90 b/flang/test/Lower/Intrinsics/get_command_argument-optional.f90 index c1b081b6112b9..545ca663feeb7 100644 --- a/flang/test/Lower/Intrinsics/get_command_argument-optional.f90 +++ b/flang/test/Lower/Intrinsics/get_command_argument-optional.f90 @@ -7,11 +7,11 @@ ! CHECK-SAME: %[[lengthParam:.*]]: !fir.ref {fir.bindc_name = "length", fir.optional}, ! CHECK-SAME: %[[statusParam:.*]]: !fir.ref {fir.bindc_name = "status", fir.optional}, ! 
CHECK-SAME: %[[errmsgParam:.*]]: !fir.boxchar<1> {fir.bindc_name = "errmsg", fir.optional}) { -subroutine test(number, value, length, status, errmsg) +subroutine test(number, value, length, status, errmsg) integer, optional :: number, status, length character(*), optional :: value, errmsg ! Note: number cannot be absent - call get_command_argument(number, value, length, status, errmsg) + call get_command_argument(number, value, length, status, errmsg) ! CHECK: %[[errmsgUnboxed:.*]]:2 = fir.unboxchar %[[errmsgParam]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[valueUnboxed:.*]]:2 = fir.unboxchar %[[valueParam]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[number:.*]] = fir.load %[[numberParam]] : !fir.ref diff --git a/flang/test/Lower/Intrinsics/ichar.f90 b/flang/test/Lower/Intrinsics/ichar.f90 index 99284455be097..eb7e03873e6b7 100644 --- a/flang/test/Lower/Intrinsics/ichar.f90 +++ b/flang/test/Lower/Intrinsics/ichar.f90 @@ -37,7 +37,7 @@ subroutine ichar_test(c) subroutine no_extui(ch) integer, parameter :: kind = selected_char_kind('ISO_10646') character(*, kind), intent(in) :: ch(:) - integer :: i, j + integer :: i, j ! CHECK-NOT: arith.extui j = ichar(ch(i)(i:i)) end subroutine diff --git a/flang/test/Lower/Intrinsics/ishftc.f90 b/flang/test/Lower/Intrinsics/ishftc.f90 index 70d71128cf9cf..f13d9c06a8197 100644 --- a/flang/test/Lower/Intrinsics/ishftc.f90 +++ b/flang/test/Lower/Intrinsics/ishftc.f90 @@ -40,7 +40,7 @@ function ishftc_test(i, j, k) ! CHECK: return %[[VAL_36]] : i32 ishftc_test = ishftc(i, j, k) end - + ! Test cases where the size argument presence can only be know at runtime module test_ishftc contains @@ -67,9 +67,9 @@ subroutine dyn_optional_scalar(i, shift, size) ! CHECK: %[[VAL_19:.*]] = arith.xori %[[VAL_9]], %[[VAL_18]] : i32 ! CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_19]], %[[VAL_18]] : i32 ! CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_11]], %[[VAL_20]] : i32 - ! ... as in non optional case + ! ... 
as in non optional case end subroutine - + ! CHECK-LABEL: func @_QMtest_ishftcPdyn_optional_array_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box> {fir.bindc_name = "shift"}, @@ -90,11 +90,11 @@ subroutine dyn_optional_array_scalar(i, shift, size) ! CHECK: %[[VAL_26:.*]] = arith.constant 32 : i32 ! CHECK: fir.result %[[VAL_26]] : i32 ! CHECK: } - ! ... as in non optional case + ! ... as in non optional case ! CHECK: } print *, ishftc(i, shift, size) end subroutine - + ! CHECK-LABEL: func @_QMtest_ishftcPdyn_optional_array( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "i"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.box> {fir.bindc_name = "shift"}, @@ -117,22 +117,22 @@ subroutine dyn_optional_array(i, shift, size) ! CHECK: %[[VAL_32:.*]] = arith.constant 32 : i32 ! CHECK: fir.result %[[VAL_32]] : i32 ! CHECK: } - ! ... as in non optional case + ! ... as in non optional case ! CHECK: } print *, ishftc(i, shift, size) end subroutine end module - + use test_ishftc integer :: i(4) = [333, 334, 335, 336] integer :: shift(4) = [2, 1, -1, -2] integer :: size(4) = [2, 4, 8, 16] call dyn_optional_scalar(i(1), shift(1)) call dyn_optional_scalar(i(1), shift(1), size(1)) - + call dyn_optional_array_scalar(i, shift) call dyn_optional_array_scalar(i, shift, size(1)) - + call dyn_optional_array(i, shift) call dyn_optional_array(i, shift, size) end diff --git a/flang/test/Lower/Intrinsics/max.f90 b/flang/test/Lower/Intrinsics/max.f90 index 1909a4eca3f67..c3d2457ef1af1 100644 --- a/flang/test/Lower/Intrinsics/max.f90 +++ b/flang/test/Lower/Intrinsics/max.f90 @@ -31,8 +31,8 @@ subroutine dynamic_optional(a, b, c) ! CHECK: fir.result %[[VAL_36]] : !fir.array ! CHECK: } print *, max(a, b, c) - end subroutine - + end subroutine + ! CHECK-LABEL: func @_QMmax_testPdynamic_optional_array_expr_scalar_optional( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}, ! 
CHECK-SAME: %[[VAL_1:.*]]: !fir.box> {fir.bindc_name = "b"}, @@ -60,8 +60,8 @@ subroutine dynamic_optional_array_expr_scalar_optional(a, b, c) ! CHECK: %[[VAL_30:.*]] = fir.array_update %[[VAL_21]], %[[VAL_26]], %[[VAL_20]] : (!fir.array, i32, index) -> !fir.array ! CHECK: fir.result %[[VAL_30]] : !fir.array ! CHECK: } - end subroutine - + end subroutine + ! CHECK-LABEL: func @_QMmax_testPdynamic_optional_scalar( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "a"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "b"}, @@ -84,8 +84,8 @@ subroutine dynamic_optional_scalar(a, b, c) ! CHECK: fir.result %[[VAL_12]] : i32 ! CHECK: } ! CHECK: fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[VAL_13]]) {{.*}}: (!fir.ref, i32) -> i1 - end subroutine - + end subroutine + ! CHECK-LABEL: func @_QMmax_testPdynamic_optional_weird( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "a"}, ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "b"}, @@ -123,9 +123,9 @@ subroutine dynamic_optional_weird(a, b, c, d, e) ! CHECK: fir.result %[[VAL_23]] : i32 ! CHECK: } ! CHECK: fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[VAL_24]]) {{.*}}: (!fir.ref, i32) -> i1 - end subroutine + end subroutine end module - + use :: max_test integer :: a(4) = [1,12,23, 34] integer :: b(4) = [31,22,13, 4] diff --git a/flang/test/Lower/Intrinsics/maxloc.f90 b/flang/test/Lower/Intrinsics/maxloc.f90 index 87f17881e0476..13dbe984043b6 100644 --- a/flang/test/Lower/Intrinsics/maxloc.f90 +++ b/flang/test/Lower/Intrinsics/maxloc.f90 @@ -18,7 +18,7 @@ subroutine maxloc_test(arr,res) ! CHECK-DAG: %[[a14:.*]] = fir.box_addr %[[a12]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a14]] end subroutine - + ! CHECK-LABEL: func @_QPmaxloc_test2( ! CHECK-SAME: %[[arg0:.*]]: !fir.box>{{.*}}, %[[arg1:.*]]: !fir.box>{{.*}}, %[[arg2:.*]]: !fir.ref{{.*}}) { subroutine maxloc_test2(arr,res,d) @@ -39,7 +39,7 @@ subroutine maxloc_test2(arr,res,d) ! 
CHECK: %[[a13:.*]] = fir.box_addr %[[a12]] : (!fir.box>) -> !fir.heap ! CHECK: fir.freemem %[[a13]] end subroutine - + ! CHECK-LABEL: func @_QPtest_maxloc_optional_scalar_mask( ! CHECK-SAME: %[[VAL_0:[^:]+]]: !fir.ref> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref> @@ -65,7 +65,7 @@ subroutine test_maxloc_optional_scalar_mask(mask, back, array) ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_14]] : (!fir.logical<4>) -> i1 ! CHECK: fir.call @_FortranAMaxlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine - + ! CHECK-LABEL: func @_QPtest_maxloc_optional_array_mask( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref> diff --git a/flang/test/Lower/Intrinsics/merge.f90 b/flang/test/Lower/Intrinsics/merge.f90 index 2e17efcaf5c2a..52417f83294b6 100644 --- a/flang/test/Lower/Intrinsics/merge.f90 +++ b/flang/test/Lower/Intrinsics/merge.f90 @@ -9,7 +9,7 @@ function merge_test(o1, o2, mask) merge_test = merge(o1, o2, mask) ! CHECK: %[[a0:.*]]:2 = fir.unboxchar %[[arg2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[a0_cast:.*]] = fir.convert %[[a0]]#0 : (!fir.ref>) -> !fir.ref> -! CHECK: %[[a1:.*]]:2 = fir.unboxchar %[[arg3]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +! CHECK: %[[a1:.*]]:2 = fir.unboxchar %[[arg3]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK: %[[a1_cast:.*]] = fir.convert %[[a1]]#0 : (!fir.ref>) -> !fir.ref> ! CHECK: %[[a2:.*]] = fir.load %[[arg4]] : !fir.ref> ! CHECK: %[[a3:.*]] = fir.convert %[[a2]] : (!fir.logical<4>) -> i1 diff --git a/flang/test/Lower/Intrinsics/minloc.f90 b/flang/test/Lower/Intrinsics/minloc.f90 index caab36d0f8138..fa3bc9b67ad91 100644 --- a/flang/test/Lower/Intrinsics/minloc.f90 +++ b/flang/test/Lower/Intrinsics/minloc.f90 @@ -18,7 +18,7 @@ subroutine minloc_test(arr,res) ! CHECK-DAG: %[[a14:.*]] = fir.box_addr %[[a12]] : (!fir.box>>) -> !fir.heap> ! 
CHECK-DAG: fir.freemem %[[a14]] end subroutine - + ! CHECK-LABEL: func @_QPminloc_test2( ! CHECK-SAME: %[[arg0:.*]]: !fir.box>{{.*}}, %[[arg1:.*]]: !fir.box>{{.*}}, %[[arg2:.*]]: !fir.ref subroutine minloc_test2(arr,res,d) @@ -39,7 +39,7 @@ subroutine minloc_test2(arr,res,d) ! CHECK: %[[a13:.*]] = fir.box_addr %[[a12]] : (!fir.box>) -> !fir.heap ! CHECK: fir.freemem %[[a13]] end subroutine - + ! CHECK-LABEL: func @_QPtest_minloc_optional_scalar_mask( ! CHECK-SAME: %[[VAL_0:[^:]+]]: !fir.ref> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref> @@ -65,7 +65,7 @@ subroutine test_minloc_optional_scalar_mask(mask, back, array) ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_14]] : (!fir.logical<4>) -> i1 ! CHECK: fir.call @_FortranAMinlocInteger4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_29]], %[[VAL_30]]) {{.*}}: (!fir.ref>, !fir.box, i32, !fir.ref, i32, !fir.box, i1) -> () end subroutine - + ! CHECK-LABEL: func @_QPtest_minloc_optional_array_mask( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref> diff --git a/flang/test/Lower/Intrinsics/modulo.f90 b/flang/test/Lower/Intrinsics/modulo.f90 index 37c4cd1a94ca2..460000a923447 100644 --- a/flang/test/Lower/Intrinsics/modulo.f90 +++ b/flang/test/Lower/Intrinsics/modulo.f90 @@ -20,7 +20,7 @@ subroutine modulo_testr(r, a, p) ! ALL: fir.store %[[res]] to %[[arg0]] : !fir.ref r = modulo(a, p) end subroutine - + ! ALL-LABEL: func @_QPmodulo_testi( ! ALL-SAME: %[[arg0:.*]]: !fir.ref{{.*}}, %[[arg1:.*]]: !fir.ref{{.*}}, %[[arg2:.*]]: !fir.ref{{.*}}) { subroutine modulo_testi(r, a, p) diff --git a/flang/test/Lower/Intrinsics/nint.f90 b/flang/test/Lower/Intrinsics/nint.f90 index 2f25eda5d20b2..166fdac90d8e1 100644 --- a/flang/test/Lower/Intrinsics/nint.f90 +++ b/flang/test/Lower/Intrinsics/nint.f90 @@ -14,4 +14,3 @@ subroutine nint_test2(i, a) i = nint(a, 8) ! 
CHECK: fir.call @llvm.lround.i64.f64 end subroutine - \ No newline at end of file diff --git a/flang/test/Lower/Intrinsics/not.f90 b/flang/test/Lower/Intrinsics/not.f90 index 140800c27e878..b772e8b8b37cf 100644 --- a/flang/test/Lower/Intrinsics/not.f90 +++ b/flang/test/Lower/Intrinsics/not.f90 @@ -13,4 +13,3 @@ subroutine not_test ! CHECK: return destination = not(source) end subroutine - \ No newline at end of file diff --git a/flang/test/Lower/Intrinsics/pack.f90 b/flang/test/Lower/Intrinsics/pack.f90 index a00c10dc2e959..f4eeef7747a82 100644 --- a/flang/test/Lower/Intrinsics/pack.f90 +++ b/flang/test/Lower/Intrinsics/pack.f90 @@ -21,7 +21,7 @@ subroutine pack_test(a,m,v,r) ! CHECK: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a13]] end subroutine - + ! CHECK-LABEL: func @_QPtest_pack_optional( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> subroutine test_pack_optional(vector, array, mask) diff --git a/flang/test/Lower/Intrinsics/perror.f90 b/flang/test/Lower/Intrinsics/perror.f90 index e746e73a5f9bc..a595ab54746bf 100644 --- a/flang/test/Lower/Intrinsics/perror.f90 +++ b/flang/test/Lower/Intrinsics/perror.f90 @@ -11,13 +11,13 @@ subroutine test_perror() ! CHECK: %[[C10:.*]] = arith.constant 10 : index ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.char<1,10> {bindc_name = "string", uniq_name = "_QFtest_perrorEstring"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[C10]] {uniq_name = "_QFtest_perrorEstring"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - + call perror(string) ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0 : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @_FortranAPerror(%[[VAL_6]]) fastmath : (!fir.ref) -> () - + ! CHECK: fir.call @_FortranAPerror(%[[VAL_6]]) fastmath : (!fir.ref) -> () + call perror("prefix") ! 
CHECK: %[[VAL_7:.*]] = fir.address_of(@{{.*}}) : !fir.ref> ! CHECK: %[[C6:.*]] = arith.constant 6 : index @@ -25,13 +25,13 @@ subroutine test_perror() ! CHECK: %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @_FortranAPerror(%[[VAL_11]]) fastmath : (!fir.ref) -> () - + ! CHECK: fir.call @_FortranAPerror(%[[VAL_11]]) fastmath : (!fir.ref) -> () + call perror(one) ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref>) -> !fir.box> ! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_12]] : (!fir.box>) -> !fir.ref> ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @_FortranAPerror(%[[VAL_14]]) fastmath : (!fir.ref) -> () + ! CHECK: fir.call @_FortranAPerror(%[[VAL_14]]) fastmath : (!fir.ref) -> () end subroutine test_perror ! CHECK-LABEL: func @_QPtest_perror_unknown_length( diff --git a/flang/test/Lower/Intrinsics/product.f90 b/flang/test/Lower/Intrinsics/product.f90 index df7c1e4ce7eaa..c64982e435f07 100644 --- a/flang/test/Lower/Intrinsics/product.f90 +++ b/flang/test/Lower/Intrinsics/product.f90 @@ -111,7 +111,7 @@ real function product_test_optional_4(x, use_mask) real :: x(:) logical :: use_mask logical, allocatable :: mask(:) -if (use_mask) then +if (use_mask) then allocate(mask(size(x, 1))) call set_mask(mask) ! 
CHECK: fir.call @_QPset_mask diff --git a/flang/test/Lower/Intrinsics/reduce.f90 b/flang/test/Lower/Intrinsics/reduce.f90 index 083dca5c3cd9f..27c4277ffebeb 100644 --- a/flang/test/Lower/Intrinsics/reduce.f90 +++ b/flang/test/Lower/Intrinsics/reduce.f90 @@ -19,7 +19,7 @@ pure function red_int1_interface_value(a, b) integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10) integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16) - + contains @@ -46,11 +46,11 @@ subroutine integer1(a, id, d1, d2) res = reduce(a, red_int1) res = reduce(a, red_int1, identity=id) - + res = reduce(a, red_int1, identity=id, ordered = .true.) res = reduce(a, red_int1, [.true., .true., .false.]) - + res = reduce(a, red_int1_value) fptr => red_int1 diff --git a/flang/test/Lower/Intrinsics/reshape.f90 b/flang/test/Lower/Intrinsics/reshape.f90 index 4f4f50965dd1b..b960a3e380786 100644 --- a/flang/test/Lower/Intrinsics/reshape.f90 +++ b/flang/test/Lower/Intrinsics/reshape.f90 @@ -24,16 +24,16 @@ subroutine reshape_test(x, source, pd, sh, ord) ! CHECK-DAG: %[[a18:.*]] = fir.box_addr %[[a15]] : (!fir.box>>) -> !fir.heap> ! CHECK-DAG: fir.freemem %[[a18]] end subroutine - + ! CHECK-LABEL: func @_QPtest_reshape_optional( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> ! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>>> subroutine test_reshape_optional(pad, order, source, shape) - real, pointer :: pad(:, :) - integer, pointer :: order(:) + real, pointer :: pad(:, :) + integer, pointer :: order(:) real :: source(:, :, :) - integer :: shape(4) - print *, reshape(source=source, shape=shape, pad=pad, order=order) + integer :: shape(4) + print *, reshape(source=source, shape=shape, pad=pad, order=order) ! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_0]] : !fir.ref>>> ! CHECK: %[[VAL_14:.*]] = fir.box_addr %[[VAL_13]] : (!fir.box>>) -> !fir.ptr> ! 
CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (!fir.ptr>) -> i64 diff --git a/flang/test/Lower/Intrinsics/scale.f90 b/flang/test/Lower/Intrinsics/scale.f90 index 9c97349d1dd57..9034c48e639b5 100644 --- a/flang/test/Lower/Intrinsics/scale.f90 +++ b/flang/test/Lower/Intrinsics/scale.f90 @@ -14,7 +14,7 @@ subroutine scale_test1(x, i) ! CHECK: %[[tmp:.*]] = fir.call @_FortranAScale4(%[[x_val]], %[[i_cast]]) {{.*}}: (f32, i64) -> f32 ! CHECK: hlfir.assign %[[tmp]] to %[[res]]#0 : f32, !fir.ref end subroutine scale_test1 - + ! CHECK-LABEL: scale_test2 subroutine scale_test2(x, i) real(kind=8) :: x, res diff --git a/flang/test/Lower/Intrinsics/spread.f90 b/flang/test/Lower/Intrinsics/spread.f90 index 3c20ec29ebc11..d4d16a6637767 100644 --- a/flang/test/Lower/Intrinsics/spread.f90 +++ b/flang/test/Lower/Intrinsics/spread.f90 @@ -30,7 +30,7 @@ subroutine spread_test(s,d,n,r) ! CHECK-DAG: %[[a15:.*]] = fir.box_addr %[[a13]] : (!fir.box>>) -> !fir.heap> ! CHECK: fir.freemem %[[a15]] end subroutine - + ! CHECK-LABEL: func @_QMspread_modPspread_test2( ! CHECK-SAME: %[[arg0:.*]]: !fir.box>{{.*}}, %[[arg1:[^:]+]]: !fir.ref{{.*}}, %[[arg2:[^:]+]]: !fir.ref{{.*}}, %[[arg3:.*]]: !fir.box>{{.*}}) { subroutine spread_test2(s,d,n,r) diff --git a/flang/test/Lower/Intrinsics/sum.f90 b/flang/test/Lower/Intrinsics/sum.f90 index 3167617b60457..454d564684e89 100644 --- a/flang/test/Lower/Intrinsics/sum.f90 +++ b/flang/test/Lower/Intrinsics/sum.f90 @@ -111,7 +111,7 @@ integer function sum_test_optional_4(x, use_mask) integer :: x(:) logical :: use_mask logical, allocatable :: mask(:) -if (use_mask) then +if (use_mask) then allocate(mask(size(x, 1))) call set_mask(mask) ! CHECK: fir.call @_QPset_mask diff --git a/flang/test/Lower/Intrinsics/system.f90 b/flang/test/Lower/Intrinsics/system.f90 index 6ea98bca7de72..183725cf29133 100644 --- a/flang/test/Lower/Intrinsics/system.f90 +++ b/flang/test/Lower/Intrinsics/system.f90 @@ -1,8 +1,8 @@ ! RUN: bbc -emit-hlfir %s -o - | FileCheck %s ! 
CHECK-LABEL: func.func @_QPall_args( -! CHECK-SAME: %[[commandArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "command"}, -! CHECK-SAME: %[[exitstatArg:.*]]: !fir.ref {fir.bindc_name = "exitstat"}) { +! CHECK-SAME: %[[commandArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "command"}, +! CHECK-SAME: %[[exitstatArg:.*]]: !fir.ref {fir.bindc_name = "exitstat"}) { subroutine all_args(command, exitstat) CHARACTER(*) :: command INTEGER :: exitstat diff --git a/flang/test/Lower/Intrinsics/transfer.f90 b/flang/test/Lower/Intrinsics/transfer.f90 index 2cc7e93f86f51..a792c8e91ba01 100644 --- a/flang/test/Lower/Intrinsics/transfer.f90 +++ b/flang/test/Lower/Intrinsics/transfer.f90 @@ -27,7 +27,7 @@ subroutine trans_test(store, word) real :: word store = transfer(word, store) end subroutine - + ! CHECK-LABEL: func @_QPtrans_test2( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>{{.*}}, %[[VAL_1:.*]]: !fir.ref{{.*}}) { ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> @@ -69,13 +69,13 @@ subroutine trans_test(store, word) ! CHECK: fir.freemem %[[VAL_25]] ! CHECK: return ! CHECK: } - + subroutine trans_test2(store, word) integer :: store(3) real :: word store = transfer(word, store, 3) end subroutine - + integer function trans_test3(p) ! CHECK-LABEL: func @_QPtrans_test3( ! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref{{.*}}) -> i32 { diff --git a/flang/test/Lower/Intrinsics/unlink-sub.f90 b/flang/test/Lower/Intrinsics/unlink-sub.f90 index ac535005fd442..3b5c22adf58ea 100644 --- a/flang/test/Lower/Intrinsics/unlink-sub.f90 +++ b/flang/test/Lower/Intrinsics/unlink-sub.f90 @@ -41,7 +41,7 @@ subroutine all_arguments(path, status) !CHECK: %[[unlink_result:.*]] = fir.call @_FortranAUnlink(%[[path]], %[[path_len]], %[[src_path]], %[[line]]) !CHECK-SAME: : (!fir.ref, i64, !fir.ref, i32) !CHECK-SAME: -> i32 - + !CHECK-DAG: %[[status_i64:.*]] = fir.convert %[[status_decl]]#0 : (!fir.ref) -> i64 !CHECK-DAG: %[[c_null:.*]] = arith.constant 0 : i64 !CHECK-DAG: %[[cmp_result:.*]] = arith.cmpi ne, %[[status_i64]], %[[c_null]] : i64 From e5c418ff1146bbc014af96501937ee3ea23c26af Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 13 Nov 2025 08:02:47 -0800 Subject: [PATCH 03/25] [NFC][TableGen] Adopt CodeGenHelpers in SubtargetEmitter (#163820) - Adopt ifdef and namespace emitters in SubtargeEmitter. - To aid that, factor out emission of different sections of the code into individual helper functions. --- llvm/include/llvm/TableGen/CodeGenHelpers.h | 44 +++++- llvm/utils/TableGen/SubtargetEmitter.cpp | 165 ++++++++++---------- 2 files changed, 119 insertions(+), 90 deletions(-) diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h index 95866e306b5ff..e357b2670be15 100644 --- a/llvm/include/llvm/TableGen/CodeGenHelpers.h +++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h @@ -21,18 +21,37 @@ namespace llvm { -// Simple RAII helper for emitting ifdef-undef-endif scope. +// Simple RAII helper for emitting ifdef-undef-endif scope. `LateUndef` controls +// whether the undef is emitted at the start of the scope (false) or at the end +// of the scope (true). 
class IfDefEmitter { public: - IfDefEmitter(raw_ostream &OS, StringRef Name) : Name(Name.str()), OS(OS) { - OS << "#ifdef " << Name << "\n" - << "#undef " << Name << "\n\n"; + IfDefEmitter(raw_ostream &OS, StringRef Name, bool LateUndef = false) + : Name(Name.str()), OS(OS), LateUndef(LateUndef) { + OS << "#ifdef " << Name << "\n"; + if (!LateUndef) + OS << "#undef " << Name << "\n"; + OS << "\n"; + } + ~IfDefEmitter() { close(); } + + // Explicit function to close the ifdef scopes. + void close() { + if (Closed) + return; + + OS << "\n"; + if (LateUndef) + OS << "#undef " << Name << "\n"; + OS << "#endif // " << Name << "\n\n"; + Closed = true; } - ~IfDefEmitter() { OS << "\n#endif // " << Name << "\n\n"; } private: std::string Name; raw_ostream &OS; + bool LateUndef; + bool Closed = false; }; // Simple RAII helper for emitting header include guard (ifndef-define-endif). @@ -43,11 +62,20 @@ class IncludeGuardEmitter { OS << "#ifndef " << Name << "\n" << "#define " << Name << "\n\n"; } - ~IncludeGuardEmitter() { OS << "\n#endif // " << Name << "\n"; } + ~IncludeGuardEmitter() { close(); } + + // Explicit function to close the ifdef scopes. + void close() { + if (Closed) + return; + OS << "\n#endif // " << Name << "\n\n"; + Closed = true; + } private: std::string Name; raw_ostream &OS; + bool Closed = false; }; // Simple RAII helper for emitting namespace scope. Name can be a single @@ -65,7 +93,9 @@ class NamespaceEmitter { // Explicit function to close the namespace scopes. 
void close() { - if (!Closed && !Name.empty()) + if (Closed) + return; + if (!Name.empty()) OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 2f15cc8c76548..ae0431e79e1bc 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/CodeGenHelpers.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/StringToOffsetTable.h" @@ -75,7 +76,15 @@ class SubtargetEmitter : TargetFeaturesEmitter { CodeGenTarget TGT; CodeGenSchedModels &SchedModels; + FeatureMapTy emitEnums(raw_ostream &OS); void emitSubtargetInfoMacroCalls(raw_ostream &OS); + std::tuple + emitMCDesc(raw_ostream &OS, const FeatureMapTy &FeatureMap); + void emitTargetDesc(raw_ostream &OS); + void emitHeader(raw_ostream &OS); + void emitCtor(raw_ostream &OS, unsigned NumNames, unsigned NumFeatures, + unsigned NumProcs); + unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap); unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap); unsigned cpuNames(raw_ostream &OS); @@ -141,7 +150,9 @@ class SubtargetEmitter : TargetFeaturesEmitter { /// Emit some information about the SubtargetFeature as calls to a macro so /// that they can be used from C++. void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) { - OS << "\n#ifdef GET_SUBTARGETINFO_MACRO\n"; + // Undef the GET_SUBTARGETINFO_MACRO macro at the end of the scope since it's + // used within the scope. 
+ IfDefEmitter IfDefMacro(OS, "GET_SUBTARGETINFO_MACRO", /*LateUndef=*/true); std::vector FeatureList = Records.getAllDerivedDefinitions("SubtargetFeature"); @@ -167,14 +178,6 @@ void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) { OS << "GET_SUBTARGETINFO_MACRO(" << FieldName << ", " << Default << ", " << Getter << ")\n"; } - OS << "#undef GET_SUBTARGETINFO_MACRO\n"; - OS << "#endif // GET_SUBTARGETINFO_MACRO\n\n"; - - OS << "\n#ifdef GET_SUBTARGETINFO_MC_DESC\n"; - OS << "#undef GET_SUBTARGETINFO_MC_DESC\n\n"; - - if (Target == "AArch64") - OS << "#include \"llvm/TargetParser/AArch64TargetParser.h\"\n\n"; } // @@ -440,26 +443,24 @@ void SubtargetEmitter::emitStageAndOperandCycleData( continue; StringRef Name = ProcModel.ItinsDef->getName(); - OS << "\n// Functional units for \"" << Name << "\"\n" - << "namespace " << Name << "FU {\n"; - - for (const auto &[Idx, FU] : enumerate(FUs)) - OS << " const InstrStage::FuncUnits " << FU->getName() << " = 1ULL << " - << Idx << ";\n"; + { + OS << "\n// Functional units for \"" << Name << "\"\n"; + NamespaceEmitter FUNamespace(OS, (Name + Twine("FU")).str()); - OS << "} // end namespace " << Name << "FU\n"; + for (const auto &[Idx, FU] : enumerate(FUs)) + OS << " const InstrStage::FuncUnits " << FU->getName() << " = 1ULL << " + << Idx << ";\n"; + } ConstRecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP"); if (BPs.empty()) continue; - OS << "\n// Pipeline forwarding paths for itineraries \"" << Name << "\"\n" - << "namespace " << Name << "Bypass {\n"; + OS << "\n// Pipeline forwarding paths for itineraries \"" << Name << "\"\n"; + NamespaceEmitter BypassNamespace(OS, (Name + Twine("Bypass")).str()); OS << " const unsigned NoBypass = 0;\n"; for (const auto &[Idx, BP] : enumerate(BPs)) OS << " const unsigned " << BP->getName() << " = 1 << " << Idx << ";\n"; - - OS << "} // end namespace " << Name << "Bypass\n"; } // Begin stages table @@ -1940,13 +1941,14 @@ void 
SubtargetEmitter::parseFeaturesFunction(raw_ostream &OS) { } void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) { - OS << "namespace " << Target << "_MC {\n" - << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass,\n" - << " const MCInst *MI, const MCInstrInfo *MCII, " - << "const MCSubtargetInfo &STI, unsigned CPUID) {\n"; - emitSchedModelHelpersImpl(OS, /* OnlyExpandMCPredicates */ true); - OS << "}\n"; - OS << "} // end namespace " << Target << "_MC\n\n"; + { + NamespaceEmitter NS(OS, (Target + Twine("_MC")).str()); + OS << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass,\n" + << " const MCInst *MI, const MCInstrInfo *MCII, " + << "const MCSubtargetInfo &STI, unsigned CPUID) {\n"; + emitSchedModelHelpersImpl(OS, /* OnlyExpandMCPredicates */ true); + OS << "}\n"; + } OS << "struct " << Target << "GenMCSubtargetInfo : public MCSubtargetInfo {\n"; @@ -1982,46 +1984,37 @@ void SubtargetEmitter::emitGenMCSubtargetInfo(raw_ostream &OS) { } void SubtargetEmitter::emitMcInstrAnalysisPredicateFunctions(raw_ostream &OS) { - OS << "\n#ifdef GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n"; - OS << "#undef GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n\n"; - STIPredicateExpander PE(Target, /*Indent=*/0); - PE.setExpandForMC(true); - PE.setByRef(true); - for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) - PE.expandSTIPredicate(OS, Fn); - - OS << "#endif // GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS\n\n"; - OS << "\n#ifdef GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n"; - OS << "#undef GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n\n"; + { + IfDefEmitter IfDefDecls(OS, "GET_STIPREDICATE_DECLS_FOR_MC_ANALYSIS"); + PE.setExpandForMC(true); + PE.setByRef(true); + for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) + PE.expandSTIPredicate(OS, Fn); + } + IfDefEmitter IfDefDefs(OS, "GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS"); std::string ClassPrefix = Target + "MCInstrAnalysis"; PE.setExpandDefinition(true); PE.setClassPrefix(ClassPrefix); for 
(const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) PE.expandSTIPredicate(OS, Fn); - - OS << "#endif // GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS\n\n"; } -// -// SubtargetEmitter::run - Main subtarget enumeration emitter. -// -void SubtargetEmitter::run(raw_ostream &OS) { - emitSourceFileHeader("Subtarget Enumeration Source Fragment", OS); - - OS << "\n#ifdef GET_SUBTARGETINFO_ENUM\n"; - OS << "#undef GET_SUBTARGETINFO_ENUM\n\n"; - - OS << "namespace llvm {\n"; - auto FeatureMap = enumeration(OS); - OS << "} // end namespace llvm\n\n"; - OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n"; +FeatureMapTy SubtargetEmitter::emitEnums(raw_ostream &OS) { + IfDefEmitter IfDef(OS, "GET_SUBTARGETINFO_ENUM"); + NamespaceEmitter NS(OS, "llvm"); + return enumeration(OS); +} - emitSubtargetInfoMacroCalls(OS); +std::tuple +SubtargetEmitter::emitMCDesc(raw_ostream &OS, const FeatureMapTy &FeatureMap) { + IfDefEmitter IfDef(OS, "GET_SUBTARGETINFO_MC_DESC"); + if (Target == "AArch64") + OS << "#include \"llvm/TargetParser/AArch64TargetParser.h\"\n\n"; + NamespaceEmitter LlvmNS(OS, "llvm"); - OS << "namespace llvm {\n"; unsigned NumFeatures = featureKeyValues(OS, FeatureMap); OS << "\n"; emitSchedModel(OS); @@ -2067,13 +2060,11 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "nullptr, nullptr, nullptr"; } OS << ");\n}\n\n"; + return {NumNames, NumFeatures, NumProcs}; +} - OS << "} // end namespace llvm\n\n"; - - OS << "#endif // GET_SUBTARGETINFO_MC_DESC\n\n"; - - OS << "\n#ifdef GET_SUBTARGETINFO_TARGET_DESC\n"; - OS << "#undef GET_SUBTARGETINFO_TARGET_DESC\n\n"; +void SubtargetEmitter::emitTargetDesc(raw_ostream &OS) { + IfDefEmitter IfDef(OS, "GET_SUBTARGETINFO_TARGET_DESC"); OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n"; OS << "#include \"llvm/Support/Debug.h\"\n"; @@ -2081,21 +2072,21 @@ void SubtargetEmitter::run(raw_ostream &OS) { if (Target == "AArch64") OS << "#include \"llvm/TargetParser/AArch64TargetParser.h\"\n\n"; parseFeaturesFunction(OS); +} - OS << "#endif 
// GET_SUBTARGETINFO_TARGET_DESC\n\n"; - +void SubtargetEmitter::emitHeader(raw_ostream &OS) { // Create a TargetSubtargetInfo subclass to hide the MC layer initialization. - OS << "\n#ifdef GET_SUBTARGETINFO_HEADER\n"; - OS << "#undef GET_SUBTARGETINFO_HEADER\n\n"; + IfDefEmitter IfDef(OS, "GET_SUBTARGETINFO_HEADER"); + NamespaceEmitter LLVMNS(OS, "llvm"); std::string ClassName = Target + "GenSubtargetInfo"; - OS << "namespace llvm {\n"; OS << "class DFAPacketizer;\n"; - OS << "namespace " << Target << "_MC {\n" - << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass," - << " const MCInst *MI, const MCInstrInfo *MCII, " - << "const MCSubtargetInfo &STI, unsigned CPUID);\n" - << "} // end namespace " << Target << "_MC\n\n"; + { + NamespaceEmitter MCNS(OS, (Target + Twine("_MC")).str()); + OS << "unsigned resolveVariantSchedClassImpl(unsigned SchedClass," + << " const MCInst *MI, const MCInstrInfo *MCII, " + << "const MCSubtargetInfo &STI, unsigned CPUID);\n"; + } OS << "struct " << ClassName << " : public TargetSubtargetInfo {\n" << " explicit " << ClassName << "(const Triple &TT, StringRef CPU, " << "StringRef TuneCPU, StringRef FS);\n" @@ -2140,17 +2131,15 @@ void SubtargetEmitter::run(raw_ostream &OS) { PE.setByRef(false); for (const STIPredicateFunction &Fn : SchedModels.getSTIPredicates()) PE.expandSTIPredicate(OS, Fn); + OS << "};\n"; +} - OS << "};\n" - << "} // end namespace llvm\n\n"; - - OS << "#endif // GET_SUBTARGETINFO_HEADER\n\n"; - - OS << "\n#ifdef GET_SUBTARGETINFO_CTOR\n"; - OS << "#undef GET_SUBTARGETINFO_CTOR\n\n"; - +void SubtargetEmitter::emitCtor(raw_ostream &OS, unsigned NumNames, + unsigned NumFeatures, unsigned NumProcs) { + IfDefEmitter IfDef(OS, "GET_SUBTARGETINFO_CTOR"); OS << "#include \"llvm/CodeGen/TargetSchedule.h\"\n\n"; - OS << "namespace llvm {\n"; + + NamespaceEmitter LLVMNS(OS, "llvm"); OS << "extern const llvm::StringRef " << Target << "Names[];\n"; OS << "extern const llvm::SubtargetFeatureKV " << Target << 
"FeatureKV[];\n"; OS << "extern const llvm::SubtargetSubTypeKV " << Target << "SubTypeKV[];\n"; @@ -2167,6 +2156,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { OS << "extern const unsigned " << Target << "ForwardingPaths[];\n"; } + std::string ClassName = Target + "GenSubtargetInfo"; OS << ClassName << "::" << ClassName << "(const Triple &TT, StringRef CPU, " << "StringRef TuneCPU, StringRef FS)\n"; @@ -2204,11 +2194,20 @@ void SubtargetEmitter::run(raw_ostream &OS) { emitSchedModelHelpers(ClassName, OS); emitHwModeCheck(ClassName, OS, /*IsMC=*/false); emitGetMacroFusions(ClassName, OS); +} - OS << "} // end namespace llvm\n\n"; - - OS << "#endif // GET_SUBTARGETINFO_CTOR\n\n"; +// +// SubtargetEmitter::run - Main subtarget enumeration emitter. +// +void SubtargetEmitter::run(raw_ostream &OS) { + emitSourceFileHeader("Subtarget Enumeration Source Fragment", OS); + auto FeatureMap = emitEnums(OS); + emitSubtargetInfoMacroCalls(OS); + auto [NumNames, NumFeatures, NumProcs] = emitMCDesc(OS, FeatureMap); + emitTargetDesc(OS); + emitHeader(OS); + emitCtor(OS, NumNames, NumFeatures, NumProcs); emitMcInstrAnalysisPredicateFunctions(OS); } From e1324a93778624661345229f3acfe258bc495d95 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Thu, 13 Nov 2025 16:05:33 +0000 Subject: [PATCH 04/25] Revert "[Flang][OpenMP] Update declare mapper lookup via use-module" (#167896) Reverts llvm/llvm-project#163860 --- flang/include/flang/Lower/OpenMP.h | 7 --- flang/include/flang/Semantics/symbol.h | 20 +------ flang/lib/Lower/Bridge.cpp | 7 --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 12 ++-- flang/lib/Lower/OpenMP/OpenMP.cpp | 59 ++----------------- flang/lib/Semantics/mod-file.cpp | 12 ---- flang/lib/Semantics/resolve-names.cpp | 47 ++++----------- flang/lib/Semantics/symbol.cpp | 6 +- flang/test/Lower/OpenMP/declare-mapper.f90 | 26 +------- flang/test/Parser/OpenMP/map-modifiers.f90 | 7 ++- .../OpenMP/declare-mapper-modfile.f90 | 14 ----- .../OpenMP/declare-mapper-symbols.f90 
| 5 +- .../Semantics/OpenMP/map-clause-symbols.f90 | 12 ++-- 13 files changed, 37 insertions(+), 197 deletions(-) delete mode 100644 flang/test/Semantics/OpenMP/declare-mapper-modfile.f90 diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h index 962abd8952073..df01a7b82c66c 100644 --- a/flang/include/flang/Lower/OpenMP.h +++ b/flang/include/flang/Lower/OpenMP.h @@ -97,13 +97,6 @@ bool markOpenMPDeferredDeclareTargetFunctions( AbstractConverter &); void genOpenMPRequires(mlir::Operation *, const Fortran::semantics::Symbol *); -// Materialize omp.declare_mapper ops for mapper declarations found in -// imported modules. If \p scope is null, materialize for the whole -// semantics global scope; otherwise, operate recursively starting at \p scope. -void materializeOpenMPDeclareMappers( - Fortran::lower::AbstractConverter &, Fortran::semantics::SemanticsContext &, - const Fortran::semantics::Scope *scope = nullptr); - } // namespace lower } // namespace Fortran diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 95efe1ae2bd5e..cb27d544ed9f5 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -777,24 +777,6 @@ class UserReductionDetails { DeclVector declList_; }; -// Used for OpenMP DECLARE MAPPER, it holds the declaration constructs -// so they can be serialized into module files and later re-parsed when -// USE-associated. 
-class MapperDetails { -public: - using DeclVector = std::vector; - - MapperDetails() = default; - - void AddDecl(const parser::OpenMPDeclarativeConstruct *decl) { - declList_.emplace_back(decl); - } - const DeclVector &GetDeclList() const { return declList_; } - -private: - DeclVector declList_; -}; - class UnknownDetails {}; using Details = std::variant; + TypeParamDetails, MiscDetails, UserReductionDetails>; llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Details &); std::string DetailsToString(const Details &); diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 5bfcff310c232..20e85a940b182 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -448,13 +448,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { } }); - // Ensure imported OpenMP declare mappers are materialized at module - // scope before lowering any constructs that may reference them. - createBuilderOutsideOfFuncOpAndDo([&]() { - Fortran::lower::materializeOpenMPDeclareMappers( - *this, bridge.getSemanticsContext()); - }); - // Create definitions of intrinsic module constants. createBuilderOutsideOfFuncOpAndDo( [&]() { createIntrinsicModuleDefinitions(pft); }); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 2dd89168ca098..872f31fe45cca 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1397,14 +1397,10 @@ bool ClauseProcessor::processMap( } if (mappers) { assert(mappers->size() == 1 && "more than one mapper"); - const semantics::Symbol *mapperSym = mappers->front().v.id().symbol; - mapperIdName = mapperSym->name().ToString(); - if (mapperIdName != "default") { - // Mangle with the ultimate owner so that use-associated mapper - // identifiers resolve to the same symbol as their defining scope. 
- const semantics::Symbol &ultimate = mapperSym->GetUltimate(); - mapperIdName = converter.mangleName(mapperIdName, ultimate.owner()); - } + mapperIdName = mappers->front().v.id().symbol->name().ToString(); + if (mapperIdName != "default") + mapperIdName = converter.mangleName( + mapperIdName, mappers->front().v.id().symbol->owner()); } processMapObjects(stmtCtx, clauseLocation, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index fe80c46c23d06..4048aeea37b92 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3553,10 +3553,10 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct"); } -static void genOpenMPDeclareMapperImpl( - lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, - const parser::OpenMPDeclareMapperConstruct &construct, - const semantics::Symbol *mapperSymOpt = nullptr) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OpenMPDeclareMapperConstruct &construct) { mlir::Location loc = converter.genLocation(construct.source); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); const parser::OmpArgumentList &args = construct.v.Arguments(); @@ -3572,17 +3572,8 @@ static void genOpenMPDeclareMapperImpl( "Expected derived type"); std::string mapperNameStr = mapperName; - if (mapperSymOpt && mapperNameStr != "default") { - mapperNameStr = converter.mangleName(mapperNameStr, mapperSymOpt->owner()); - } else if (auto *sym = - converter.getCurrentScope().FindSymbol(mapperNameStr)) { + if (auto *sym = converter.getCurrentScope().FindSymbol(mapperNameStr)) mapperNameStr = converter.mangleName(mapperNameStr, sym->owner()); - } - - // If the mapper op already exists (e.g., created by regular lowering or by - // materialization of imported mappers), do not 
recreate it. - if (converter.getModuleOp().lookupSymbol(mapperNameStr)) - return; // Save current insertion point before moving to the module scope to create // the DeclareMapperOp @@ -3605,13 +3596,6 @@ static void genOpenMPDeclareMapperImpl( mlir::omp::DeclareMapperInfoOp::create(firOpBuilder, loc, clauseOps.mapVars); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPDeclareMapperConstruct &construct) { - genOpenMPDeclareMapperImpl(converter, semaCtx, construct); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, @@ -4247,36 +4231,3 @@ void Fortran::lower::genOpenMPRequires(mlir::Operation *mod, offloadMod.setRequires(mlirFlags); } } - -// Walk scopes and materialize omp.declare_mapper ops for mapper declarations -// found in imported modules. If \p scope is null, start from the global scope. -void Fortran::lower::materializeOpenMPDeclareMappers( - Fortran::lower::AbstractConverter &converter, - semantics::SemanticsContext &semaCtx, const semantics::Scope *scope) { - const semantics::Scope &root = scope ? *scope : semaCtx.globalScope(); - - // Recurse into child scopes first (modules, submodules, etc.). - for (const semantics::Scope &child : root.children()) - materializeOpenMPDeclareMappers(converter, semaCtx, &child); - - // Only consider module scopes to avoid duplicating local constructs. - if (!root.IsModule()) - return; - - // Only materialize for modules coming from mod files to avoid duplicates. - if (!root.symbol() || !root.symbol()->test(semantics::Symbol::Flag::ModFile)) - return; - - // Scan symbols in this module scope for MapperDetails. 
- for (auto &it : root) { - const semantics::Symbol &sym = *it.second; - if (auto *md = sym.detailsIf()) { - for (const auto *decl : md->GetDeclList()) { - if (const auto *mapperDecl = - std::get_if(&decl->u)) { - genOpenMPDeclareMapperImpl(converter, semaCtx, *mapperDecl, &sym); - } - } - } - } -} diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp index 840b98dd42139..b419864f73b8e 100644 --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -59,7 +59,6 @@ static void PutBound(llvm::raw_ostream &, const Bound &); static void PutShapeSpec(llvm::raw_ostream &, const ShapeSpec &); static void PutShape( llvm::raw_ostream &, const ArraySpec &, char open, char close); -static void PutMapper(llvm::raw_ostream &, const Symbol &, SemanticsContext &); static llvm::raw_ostream &PutAttr(llvm::raw_ostream &, Attr); static llvm::raw_ostream &PutType(llvm::raw_ostream &, const DeclTypeSpec &); @@ -939,7 +938,6 @@ void ModFileWriter::PutEntity(llvm::raw_ostream &os, const Symbol &symbol) { [&](const ProcEntityDetails &) { PutProcEntity(os, symbol); }, [&](const TypeParamDetails &) { PutTypeParam(os, symbol); }, [&](const UserReductionDetails &) { PutUserReduction(os, symbol); }, - [&](const MapperDetails &) { PutMapper(decls_, symbol, context_); }, [&](const auto &) { common::die("PutEntity: unexpected details: %s", DetailsToString(symbol.details()).c_str()); @@ -1103,16 +1101,6 @@ void ModFileWriter::PutUserReduction( } } -static void PutMapper( - llvm::raw_ostream &os, const Symbol &symbol, SemanticsContext &context) { - const auto &details{symbol.get()}; - // Emit each saved DECLARE MAPPER construct as-is, so that consumers of the - // module can reparse it and recreate the mapper symbol and semantics state. 
- for (const auto *decl : details.GetDeclList()) { - Unparse(os, *decl, context.langOptions()); - } -} - void PutInit(llvm::raw_ostream &os, const Symbol &symbol, const MaybeExpr &init, const parser::Expr *unanalyzed, SemanticsContext &context) { if (IsNamedConstant(symbol) || symbol.owner().IsDerivedType()) { diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index ea0d38c573af9..09ec951a422ca 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1852,25 +1852,21 @@ bool OmpVisitor::Pre(const parser::OmpMapClause &x) { // TODO: Do we need a specific flag or type here, to distinghuish against // other ConstructName things? Leaving this for the full implementation // of mapper lowering. - auto &ultimate{symbol->GetUltimate()}; - auto *misc{ultimate.detailsIf()}; - auto *md{ultimate.detailsIf()}; - if (!md && (!misc || misc->kind() != MiscDetails::Kind::ConstructName)) + auto *misc{symbol->detailsIf()}; + if (!misc || misc->kind() != MiscDetails::Kind::ConstructName) context().Say(mapper->v.source, "Name '%s' should be a mapper name"_err_en_US, mapper->v.source); else mapper->v.symbol = symbol; } else { - // Allow the special 'default' mapper identifier without prior - // declaration so lowering can recognize and handle it. Emit an - // error for any other missing mapper identifier. - if (mapper->v.source.ToString() == "default") { - mapper->v.symbol = &MakeSymbol( - mapper->v, MiscDetails{MiscDetails::Kind::ConstructName}); - } else { - context().Say( - mapper->v.source, "'%s' not declared"_err_en_US, mapper->v.source); - } + mapper->v.symbol = + &MakeSymbol(mapper->v, MiscDetails{MiscDetails::Kind::ConstructName}); + // TODO: When completing the implementation, we probably want to error if + // the symbol is not declared, but right now, testing that the TODO for + // OmpMapClause happens is obscured by the TODO for declare mapper, so + // leaving this out. 
Remove the above line once the declare mapper is + // implemented. context().Say(mapper->v.source, "'%s' not + // declared"_err_en_US, mapper->v.source); } } return true; @@ -1884,15 +1880,8 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec, // the type has been fully processed. BeginDeclTypeSpec(); auto &mapperName{std::get(spec.t)}; - // Create or update the mapper symbol with MapperDetails and - // keep track of the declarative construct for module emission. - Symbol &mapperSym{MakeSymbol(parser::CharBlock(mapperName), Attrs{})}; - if (auto *md{mapperSym.detailsIf()}) { - md->AddDecl(declaratives_.back()); - } else if (mapperSym.has() || mapperSym.has()) { - mapperSym.set_details(MapperDetails{}); - mapperSym.get().AddDecl(declaratives_.back()); - } + MakeSymbol(parser::CharBlock(mapperName), Attrs{}, + MiscDetails{MiscDetails::Kind::ConstructName}); PushScope(Scope::Kind::OtherConstruct, nullptr); Walk(std::get(spec.t)); auto &varName{std::get(spec.t)}; @@ -3622,20 +3611,10 @@ void ModuleVisitor::Post(const parser::UseStmt &x) { rename.u); } for (const auto &[name, symbol] : *useModuleScope_) { - // Default USE imports public names, excluding intrinsic-only and most - // miscellaneous details. Allow OpenMP mapper identifiers represented - // as MapperDetails, and also legacy MiscDetails::ConstructName. 
- bool isMapper{symbol->has()}; - if (!isMapper) { - if (const auto *misc{symbol->detailsIf()}) { - isMapper = misc->kind() == MiscDetails::Kind::ConstructName; - } - } if (symbol->attrs().test(Attr::PUBLIC) && !IsUseRenamed(symbol->name()) && (!symbol->implicitAttrs().test(Attr::INTRINSIC) || symbol->has()) && - (!symbol->has() || isMapper) && - useNames.count(name) == 0) { + !symbol->has() && useNames.count(name) == 0) { SourceName location{x.moduleName.source}; if (auto *localSymbol{FindInScope(name)}) { DoAddUse(location, localSymbol->name(), *localSymbol, *symbol); diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index ed0715a422e78..0ec44b7c40491 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -338,8 +338,7 @@ std::string DetailsToString(const Details &details) { [](const TypeParamDetails &) { return "TypeParam"; }, [](const MiscDetails &) { return "Misc"; }, [](const AssocEntityDetails &) { return "AssocEntity"; }, - [](const UserReductionDetails &) { return "UserReductionDetails"; }, - [](const MapperDetails &) { return "MapperDetails"; }}, + [](const UserReductionDetails &) { return "UserReductionDetails"; }}, details); } @@ -380,7 +379,6 @@ bool Symbol::CanReplaceDetails(const Details &details) const { [&](const UserReductionDetails &) { return has(); }, - [&](const MapperDetails &) { return has(); }, [](const auto &) { return false; }, }, details); @@ -687,8 +685,6 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Details &details) { DumpType(os, type); } }, - // Avoid recursive streaming for MapperDetails; nothing more to dump - [&](const MapperDetails &) {}, [&](const auto &x) { os << x; }, }, details); diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index e4c010156ee39..c389d0ff4bd15 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -6,9 +6,7 @@ ! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-3.f90 -o - | FileCheck %t/omp-declare-mapper-3.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-4.f90 -o - | FileCheck %t/omp-declare-mapper-4.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-5.f90 -o - | FileCheck %t/omp-declare-mapper-5.f90 -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-7.mod.f90 -o - >/dev/null -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-7.use.f90 -o - | FileCheck %t/omp-declare-mapper-7.use.f90 +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 !--- omp-declare-mapper-1.f90 subroutine declare_mapper_1 @@ -303,25 +301,3 @@ subroutine declare_mapper_nested_parent r%real_arr = r%base_arr(1) + r%inner%deep_arr(1) !$omp end target end subroutine declare_mapper_nested_parent - -!--- omp-declare-mapper-7.mod.f90 -! Module with DECLARE MAPPER to be compiled separately -module m_mod - implicit none - type :: mty - integer :: x - end type mty - !$omp declare mapper(mymap : mty :: v) map(tofrom: v%x) -end module m_mod - -!--- omp-declare-mapper-7.use.f90 -! Consumer program that USEs the module and applies the mapper by name. -! 
CHECK: %{{.*}} = omp.map.info {{.*}} mapper(@{{.*mymap}}) {{.*}} {name = "a"} -program use_module_mapper - use m_mod - implicit none - type(mty) :: a - !$omp target map(mapper(mymap) : a) - a%x = 42 - !$omp end target -end program use_module_mapper diff --git a/flang/test/Parser/OpenMP/map-modifiers.f90 b/flang/test/Parser/OpenMP/map-modifiers.f90 index 7d9b8856ac833..83662b70f08f5 100644 --- a/flang/test/Parser/OpenMP/map-modifiers.f90 +++ b/flang/test/Parser/OpenMP/map-modifiers.f90 @@ -320,7 +320,7 @@ subroutine f21(x, y) integer :: x(10) integer :: y integer, parameter :: p = 23 - !$omp target map(mapper(default), from: x) + !$omp target map(mapper(xx), from: x) x = x + 1 !$omp end target end @@ -329,7 +329,7 @@ subroutine f21(x, y) !UNPARSE: INTEGER x(10_4) !UNPARSE: INTEGER y !UNPARSE: INTEGER, PARAMETER :: p = 23_4 -!UNPARSE: !$OMP TARGET MAP(MAPPER(DEFAULT), FROM: X) +!UNPARSE: !$OMP TARGET MAP(MAPPER(XX), FROM: X) !UNPARSE: x=x+1_4 !UNPARSE: !$OMP END TARGET !UNPARSE: END SUBROUTINE @@ -337,7 +337,7 @@ subroutine f21(x, y) !PARSE-TREE: OmpBeginDirective !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = target !PARSE-TREE: | OmpClauseList -> OmpClause -> Map -> OmpMapClause -!PARSE-TREE: | | Modifier -> OmpMapper -> Name = 'default' +!PARSE-TREE: | | Modifier -> OmpMapper -> Name = 'xx' !PARSE-TREE: | | Modifier -> OmpMapType -> Value = From !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' @@ -375,3 +375,4 @@ subroutine f22(x) !PARSE-TREE: | | SectionSubscript -> Integer -> Expr = 'i' !PARSE-TREE: | | | Designator -> DataRef -> Name = 'i' !PARSE-TREE: | bool = 'true' + diff --git a/flang/test/Semantics/OpenMP/declare-mapper-modfile.f90 b/flang/test/Semantics/OpenMP/declare-mapper-modfile.f90 deleted file mode 100644 index 480f87bc0f8e9..0000000000000 --- a/flang/test/Semantics/OpenMP/declare-mapper-modfile.f90 +++ /dev/null @@ -1,14 +0,0 @@ -! RUN: split-file %s %t -! 
RUN: %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=50 -module-dir %t %t/m.f90 -! RUN: cat %t/m.mod | FileCheck --ignore-case %s - -!--- m.f90 -module m - implicit none - type :: t - integer :: x - end type t - !$omp declare mapper(mymap : t :: v) map(v%x) -end module m - -!CHECK: !$OMP DECLARE MAPPER(mymap:t::v) MAP(v%x) diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 5d77540aa6453..e57a5c0c1cea6 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -11,9 +11,9 @@ program main !$omp declare mapper(ty :: maptwo) map(maptwo, maptwo%x) !! Note, symbols come out in their respective scope, but not in declaration order. -!CHECK: mymapper: MapperDetails +!CHECK: mymapper: Misc ConstructName !CHECK: ty: DerivedType components: x -!CHECK: ty.omp.default.mapper: MapperDetails +!CHECK: ty.omp.default.mapper: Misc ConstructName !CHECK: DerivedType scope: ty !CHECK: OtherConstruct scope: !CHECK: mapped (OmpMapToFrom) {{.*}} ObjectEntity type: TYPE(ty) @@ -21,3 +21,4 @@ program main !CHECK: maptwo (OmpMapToFrom) {{.*}} ObjectEntity type: TYPE(ty) end program main + diff --git a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 index 3b723e817ce87..1d6315b4a2312 100644 --- a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 +++ b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 @@ -1,16 +1,14 @@ ! 
RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main !CHECK-LABEL: MainProgram scope: MAIN - type ty - real(4) :: x - end type ty - !$omp declare mapper(xx : ty :: v) map(v) integer, parameter :: n = 256 - type(ty) :: a(256) + real(8) :: a(256) !$omp target map(mapper(xx), from:a) do i=1,n - a(i)%x = 4.2 + a(i) = 4.2 end do !$omp end target -!CHECK: xx: MapperDetails +!CHECK: OtherConstruct scope: size=0 alignment=1 sourceRange=74 bytes +!CHECK: OtherClause scope: size=0 alignment=1 sourceRange=0 bytes +!CHECK: xx: Misc ConstructName end program main From 6a0ba8b7a4176bbc78f4dcff4f21bae1e2097d67 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 13 Nov 2025 17:05:48 +0100 Subject: [PATCH 05/25] [CIR] Prepare a 'this' for CXXDefaultInitExprs (#165994) Prepare a 'this' for CXXDefaultInitExprs --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 3 +++ clang/test/CIR/CodeGen/struct-init.cpp | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index dcded94b012f4..872fc8d14ad95 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -839,6 +839,9 @@ void AggExprEmitter::visitCXXParenListOrInitListExpr( } } + // Prepare a 'this' for CXXDefaultInitExprs. 
+ CIRGenFunction::FieldConstructionScope fcScope(cgf, dest.getAddress()); + LValue destLV = cgf.makeAddrLValue(dest.getAddress(), e->getType()); if (record->isUnion()) { diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp index 8f146684ffb10..f5c013a599a40 100644 --- a/clang/test/CIR/CodeGen/struct-init.cpp +++ b/clang/test/CIR/CodeGen/struct-init.cpp @@ -205,3 +205,26 @@ void init_expr(int a, int b, int c) { // OGCG: %[[C_PLUS_THREE:.*]] = add nsw i32 %[[C]], 3 // OGCG: store i32 %[[C_PLUS_THREE]], ptr %[[S_C]] // OGCG: ret void + +void cxx_default_init_with_struct_field() { + struct Parent { + int getA(); + int a = getA(); + }; + Parent p = Parent{}; +} + +// CIR: %[[P_ADDR:.*]] = cir.alloca !rec_Parent, !cir.ptr, ["p", init] +// CIR: %[[P_ELEM_0_PTR:.*]] = cir.get_member %[[P_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[METHOD_CALL:.*]] = cir.call @_ZZ34cxx_default_init_with_struct_fieldvEN6Parent4getAEv(%[[P_ADDR]]) : (!cir.ptr) -> !s32i +// CIR: cir.store{{.*}} %[[METHOD_CALL]], %[[P_ELEM_0_PTR]] : !s32i, !cir.ptr + +// LLVM: %[[P_ADDR:.*]] = alloca %struct.Parent, i64 1, align 4 +// LLVM: %[[P_ELEM_0_PTR:.*]] = getelementptr %struct.Parent, ptr %[[P_ADDR]], i32 0, i32 0 +// LLVM: %[[METHOD_CALL:.*]] = call i32 @_ZZ34cxx_default_init_with_struct_fieldvEN6Parent4getAEv(ptr %[[P_ADDR]]) +// LLVM: store i32 %[[METHOD_CALL]], ptr %[[P_ELEM_0_PTR]], align 4 + +// OGCG: %[[P_ADDR:.*]] = alloca %struct.Parent, align 4 +// OGCG: %[[P_ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.Parent, ptr %[[P_ADDR]], i32 0, i32 0 +// OGCG: %[[METHOD_CALL:.*]] = call noundef i32 @_ZZ34cxx_default_init_with_struct_fieldvEN6Parent4getAEv(ptr {{.*}} %[[P_ADDR]]) +// OGCG: store i32 %[[METHOD_CALL]], ptr %[[P_ELEM_0_PTR]], align 4 From a04c6b5512bf091b4eec6c4f7dbfaaf44b290906 Mon Sep 17 00:00:00 2001 From: Ryan Buchner Date: Thu, 13 Nov 2025 08:12:40 -0800 Subject: [PATCH 06/25] [LV] Update 
LoopVectorizationPlanner::emitInvalidCostRemarks to handle reduction plans (#165913) The TypeSwitch for extracting the Opcode now handles the `VPReductionRecipe` case. Fixes #165359. --- .../Transforms/Vectorize/LoopVectorize.cpp | 3 +++ .../LoopVectorize/AArch64/bug165359.ll | 25 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/bug165359.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 835b0995cc4fc..f4629d22002dc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4018,6 +4018,9 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( .Case([](const VPInterleaveRecipe *R) { return R->getStoredValues().empty() ? Instruction::Load : Instruction::Store; + }) + .Case([](const auto *R) { + return RecurrenceDescriptor::getOpcode(R->getRecurrenceKind()); }); // If the next recipe is different, or if there are no other pairs, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/bug165359.ll b/llvm/test/Transforms/LoopVectorize/AArch64/bug165359.ll new file mode 100644 index 0000000000000..87320c547a757 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/bug165359.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -passes=loop-vectorize -S -pass-remarks-analysis=loop-vectorize -disable-output &> %t +; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s + +; CHECK-REMARKS: remark: :0:0: Recipe with invalid costs prevented vectorization at VF=(vscale x 1): fadd + +target triple = "aarch64-unknown-linux-gnu" + +define double @reduce_fail(i64 %loop_count, double %d0, ptr %ptr1) #0 { +entry: + %d1 = load double, ptr %ptr1 + br label %loop + +loop: + %acc0 = phi double [ %fadd0, %loop ], [ %d0, %entry ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %fadd0 = fadd double %acc0, %d1 + %iv.next = add nsw nuw i64 %iv, 1 + %exit_cond = icmp eq i64 %iv.next, %loop_count + 
br i1 %exit_cond, label %loopexit, label %loop + +loopexit: + ret double %fadd0 +} + +attributes #0 = { "target-features"="+sve" } From f7e652127772e9390ecd1fee9504c07435a9bb87 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 13 Nov 2025 16:26:01 +0000 Subject: [PATCH 07/25] [compiler-rt][ARM] Optimized mulsf3 and divsf3 (#161546) This commit adds optimized assembly versions of single-precision float multiplication and division. Both functions are implemented in a style that can be assembled as either of Arm and Thumb2; for multiplication, a separate implementation is provided for Thumb1. Also, extensive new tests are added for multiplication and division. These implementations can be removed from the build by defining the cmake variable COMPILER_RT_ARM_OPTIMIZED_FP=OFF. Outlying parts of the functionality which are not on the fast path, such as NaN handling and underflow, are handled in helper functions written in C. These can be shared between the Arm/Thumb2 and Thumb1 implementations, and also reused by other optimized assembly functions we hope to add in future. 
--- .../cmake/Modules/CheckAssemblerFlag.cmake | 38 ++ compiler-rt/lib/builtins/CMakeLists.txt | 45 ++ compiler-rt/lib/builtins/arm/divsf3.S | 608 +++++++++++++++++ compiler-rt/lib/builtins/arm/fnan2.c | 42 ++ compiler-rt/lib/builtins/arm/fnorm2.c | 62 ++ compiler-rt/lib/builtins/arm/funder.c | 78 +++ compiler-rt/lib/builtins/arm/mulsf3.S | 309 +++++++++ compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 251 +++++++ compiler-rt/test/builtins/CMakeLists.txt | 4 + compiler-rt/test/builtins/Unit/divsf3_test.c | 503 +++++++++++--- compiler-rt/test/builtins/Unit/mulsf3_test.c | 616 ++++++++++++++++++ 11 files changed, 2461 insertions(+), 95 deletions(-) create mode 100644 compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake create mode 100644 compiler-rt/lib/builtins/arm/divsf3.S create mode 100644 compiler-rt/lib/builtins/arm/fnan2.c create mode 100644 compiler-rt/lib/builtins/arm/fnorm2.c create mode 100644 compiler-rt/lib/builtins/arm/funder.c create mode 100644 compiler-rt/lib/builtins/arm/mulsf3.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/mulsf3.S create mode 100644 compiler-rt/test/builtins/Unit/mulsf3_test.c diff --git a/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake new file mode 100644 index 0000000000000..49e8b8547c5cd --- /dev/null +++ b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake @@ -0,0 +1,38 @@ +# Helper function to find out whether the assembler supports a particular +# command-line flag. You'd like to use the standard check_compiler_flag(), but +# that only supports a fixed list of languages, and ASM isn't one of them. So +# we do it ourselves, by trying to assemble an empty source file. 
+ +function(check_assembler_flag outvar flag) + if(NOT DEFINED "${outvar}") + if(NOT CMAKE_REQUIRED_QUIET) + message(CHECK_START "Checking for assembler flag ${flag}") + endif() + + # Stop try_compile from attempting to link the result of the assembly, so + # that we don't depend on having a working linker, and also don't have to + # figure out what special symbol like _start needs to be defined in the + # test input. + # + # This change is made within the dynamic scope of this function, so + # CMAKE_TRY_COMPILE_TARGET_TYPE will be restored to its previous value on + # return. + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + + # Try to assemble an empty file with a .S name, using the provided flag. + try_compile(success + SOURCE_FROM_CONTENT "CheckAssemblerFlag.s" "" + COMPILE_DEFINITIONS ${flag} + NO_CACHE) + + if(NOT CMAKE_REQUIRED_QUIET) + if(success) + message(CHECK_PASS "Accepted") + set(${outvar} 1 CACHE INTERNAL "Test assembler flag ${flag}") + else() + message(CHECK_FAIL "Not accepted") + set(${outvar} "" CACHE INTERNAL "Test assembler flag ${flag}") + endif() + endif() + endif() +endfunction() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 02e6ecfbdb60e..6f5c2cd7d1971 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -60,6 +60,7 @@ endif() include(builtin-config-ix) include(CMakeDependentOption) include(CMakePushCheckState) +include(CheckAssemblerFlag) option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -423,6 +424,40 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) +option(COMPILER_RT_ARM_OPTIMIZED_FP + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." 
ON) + +if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") + check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always) + if(COMPILER_RT_HAS_MIMPLICIT_IT) + set(implicit_it_flag -mimplicit-it=always) + else() + check_assembler_flag( + COMPILER_RT_HAS_WA_MIMPLICIT_IT -Wa,-mimplicit-it=always) + if(COMPILER_RT_HAS_WA_MIMPLICIT_IT) + set(implicit_it_flag -Wa,-mimplicit-it=always) + else() + message(WARNING "Don't know how to set the -mimplicit-it=always flag in this assembler; not including Arm optimized implementations") + set(implicit_it_flag "") + endif() + endif() + + if(implicit_it_flag) + set(assembly_files + arm/mulsf3.S + arm/divsf3.S) + set_source_files_properties(${assembly_files} + PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) + set(arm_or_thumb2_base_SOURCES + ${assembly_files} + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${arm_or_thumb2_base_SOURCES} + ) + endif() +endif() + set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -456,6 +491,16 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(thumb1_base_SOURCES + arm/thumb1/mulsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${thumb1_base_SOURCES} + ) +endif() + set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S new file mode 100644 index 0000000000000..2f37234457b7b --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -0,0 +1,608 @@ +//===-- divsf3.S - single-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float division with the IEEE-754 +// default rounding (to nearest, ties to even), in optimized AArch32 assembly +// language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3) + +DEFINE_COMPILERRT_FUNCTION(__divsf3) + // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 + // of each register so that there will be space lower down to store extra + // data without exponent arithmetic carrying into it. In the process, check + // both exponents for 00 or FF and branch out of line to handle all the + // uncommon types of value (infinity, NaN, zero, denormals). + // + // Chaining conditional instructions like this means that the second + // instruction (setting up r3) might not be executed at all, so fdiv_uncommon + // will have to redo it just in case. That saves an instruction here, + // executed for _all_ inputs, and moves it to the uncommon path run for only + // some inputs. + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) + andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) + teqne r2, r12 // if neither was 0, is one FF? + teqne r3, r12 // or the other? + beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + + // Calculate the output sign, which is always just the XOR of the input + // signs. Store it in bit 8 of r2, below the numerator exponent. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Isolate the mantissas of both values, by setting bit 23 of each one and + // clearing the 8 bits above that. 
+ // + // In the process, swap the register allocations (which doesn't cost extra + // instructions if we do it as part of this manipulation). We want the + // numerator not to be in r0, because r0 is where we'll build up the quotient + // while subtracting things from the numerator. + orr r12, r0, #1 << 23 + orr r0, r1, #1 << 23 + bic r1, r12, #0xFF000000 + bic r0, r0, #0xFF000000 + +LOCAL_LABEL(div): + // Start of the main division. We get here knowing that: + // + // r0 = mantissa of denominator, with the leading 1 at bit 23 + // r1 = mantissa of numerator, similarly + // r2 = (exponent of numerator << 16) + (result sign << 8) + // r3 = (exponent of denominator << 16) + + push {r14} // we'll need an extra register + + // Calculate the initial result exponent by just subtracting the two input + // exponents. This doesn't affect the sign bit lower down in r2. + sub r2, r2, r3 + + // That initial exponent might need to be adjusted by 1, depending on whether + // dividing the mantissas gives a value >=1 or <1. We don't need to wait + // until the division is finished to work that out: we can tell immediately + // by just comparing the mantissas. + // + // The basic idea is to do the comparison in a way that sets the C flag if + // numerator >= denominator. Then we recombine the sign and exponent by doing + // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted + // down to the low 8 bits, just below the sign bit, and using ADC rather than + // ADD folds in the conditional increment from the mantissa comparison. + // + // If we're not incrementing the output exponent, we instead shift the + // numerator mantissa left by 1, so that it _is_ greater than the denominator + // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. + // + // The exponent also needs to be rebiased, so that dividing two numbers the + // same gives an output exponent of 0x7F. 
If the two inputs have the same + // exponent then we'll have computed an exponent of 0 via the SUB instruction + // above; if the mantissas are the same as well then the ADC will increment + // it; also, the leading bit of the quotient will increment the exponent + // again when we recombine it with the output mantissa later. So we need to + // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB + // come to 0x7F after both of those increments. + // + // Putting all of that together, what we _want_ to do is this: + // + // [#1] CMP r1, r0 // set C if num >= den + // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left + // [#3] ADD r2, r2, #0x7D0000 // rebias exponent + // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment + // + // However, we only do the first of those four instructions right here. The + // other three are distributed through the code below, after unrelated load + // or multiply instructions which will have a result delay slot on simple + // CPUs. Each is labelled "exponent setup [#n]" in a comment. + // + // (Since instruction #4 depends on the flags set up by #2, we must avoid + // clobbering the flags in _any_ of the instructions interleaved with this!) + cmp r1, r0 // exponent setup [#1] + + // Start the mantissa division by making an approximation to the reciprocal + // of the denominator. We first obtain an 8-bit approximation using a table + // lookup indexed by the top 7 denominator bits (counting the leading 1, so + // really there are only 6 bits in the table index). + // + // (r0 >> 17) is the table index, and its top bit is always set, so it ranges + // from 64 to 127 inclusive. So we point the base register 64 bytes before + // the actual table. + adr r12, LOCAL_LABEL(tab) - 64 +#if __thumb__ + // Thumb can't do this particular shift+add+load in one instruction - it only + // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must + // calculate the load offset separately. 
+ add r14, r12, r0, lsr #17 + ldrb r14, [r14] +#else + ldrb r14, [r12, r0, lsr #17] +#endif + + // Now do an iteration of Newton-Raphson to improve that 8-bit approximation + // to have 15-16 accurate bits. + // + // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d + // and you have some approximation x, your next approximation is X = x(2-dx). + // Looked at one way, this is the result of applying the N-R formula + // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it + // is to suppose that dx = 1 - e, for some e which is small (because dx is + // already reasonably close to 1). Then you want to double the number of + // correct bits in the next approximation, i.e. square the error. So you want + // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. + // + // In this situation, we're working in fixed-point integers rather than real + // numbers, and all the scales are different: + // * our input denominator d is in the range [2^23,2^24) + // * our input approximation x is in the range [2^7,2^8) + // * we want the output approximation to be in the range [2^15,2^16) + // Those factors combine to mean that we want + // x(2^32-dx) / 2^23 + // = (2^9 x) - (dx^2 / 2^23) + // + // But we also want to compute this using ordinary MUL, not a long multiply + // instruction (those are slower). So we need to worry about the product + // overflowing. dx fits in 32 bits, because it's the product of something + // <2^24 with something <2^8; but we must shift it right before multiplying + // by x again. 
+ + mul r12, r0, r14 // r12 = dx + movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot + mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 + mul r3, r12, r14 // r3 ~= -dx^2/2^8 + mov r14, r14, lsl #9 // r14 = 2^9 x + add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 + + // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, + // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 + // in principle, and the max mantissa 2^24-1 would have reciprocal just over + // 2^15). The error is always negative (r14 is an underestimate of the true + // value), and the maximum error is 6 and a bit ULP (that is, the true + // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly + // less than 0x10000 (even in the case of the min mantissa, where the true + // value would be _exactly_ 0x10000), which eliminates a case of integer + // overflow. + // + // All of these properties of the reciprocal approximation are checked by + // exhaustively iterating over all 2^23 possible input mantissas. (The nice + // thing about doing this in single rather than double precision!) + // + // Now we extract most of the quotient by two steps of long division, using + // the reciprocal estimate to identify a multiple of the denominator to + // subtract from the numerator. To avoid integer overflow, the numerator + // mantissa is shifted down 8 bits so that it's less than 0x10000. After we + // calculate an approximate quotient, we shift the numerator left and + // subtract that multiple of the denominator, moving the next portion of the + // numerator into range for the next iteration. + + // First iteration of long division. We shift the numerator left 11 bits, and + // since the quotient approximation is scaled by 2^31, we must shift that + // right by 20 to make the right product to subtract from the numerator. 
+ mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #11 // shift numerator left, ready for subtraction + mov r3, r12, lsr #20 // make first 12-bit block of quotient bits + mls r1, r0, r3, r1 // subtract that multiple of den from num + + add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + + // Second iteration of long division. Differences from the first step: this + // time we shift the numerator 12 bits instead of 11, so that the total of + // both steps is 23 bits, i.e. we've shifted up by exactly the full width of + // the output mantissa. Also, the block of output quotient bits is left in a + // different register: it was in r3 the first time, and this time it's in + // r12, so that we still have both available at the end of the process. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #12 // shift numerator left, ready for subtraction + mov r12, r12, lsr #19 // make second 11-bit block of quotient + mls r1, r0, r12, r1 // subtract that multiple of den from num + + adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot + + // Now r1 contains the original numerator, shifted left 23, minus _some_ + // multiple of the original denominator (which is still in r0). The bounds on + // the error in the above steps should make the error at most 1: that is, we + // may have to subtract the denominator one more time to make r1 < r0, and + // increment the quotient by one more. + // + // Our quotient is still in two pieces, computed separately in the above long + // division steps. We fold the final increment into the same instruction that + // recombines them, by doing the comparison in such a way that it sets the + // carry flag if the increment is needed. 
+ + cmp r1, r0 // Set carry flag if num >= den + subhs r1, r1, r0 // If so, subtract den from num + adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 + + // We've finished with r14 as a temporary register, so we can unstack it now. + pop {r14} + + // Now r3 contains the _rounded-down_ output quotient, and r1 contains the + // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and + // 0 <= r1 < denominator. + // + // Next we must round to nearest, by checking if r1 is greater than half the + // denominator. In division, it's not possible to hit an exact round-to-even + // halfway case, so we don't need to spend any time checking for it. + // + // Proof of no round-to-even: define the 'width' of a dyadic rational to be + // the distance between the lowest and highest 1 bits in its binary + // representation, or equivalently, the index of its high bit if you scale it + // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has + // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. + // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w + // be the maximum width that the input precision supports (so that for single + // precision, w=23). Then if some division n/d were a round-to-even case, the + // true quotient q=n/d would have width exactly w+1. But we have qd=n, so + // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the + // input precision, hence had width <= w.) + // + // So we don't need to check for an exact _halfway_ case and clear the low + // bit of the quotient after rounding up, as addition and multiplication both + // need to do. But we do need to remember if the quotient itself was exact, + // that is, if there was no remainder at all. That's needed in underflow + // handling. + + // The rounding check wants to compare remainder with denominator/2. But of + // course in integers it's easier to compare 2*remainder with denominator. 
So + // we start by shifting the remainder left by 1, and in the process, set Z if + // it's exactly 0 (i.e. the result needs no rounding at all). + lsls r1, r1, #1 + // Now trial-subtract the denominator. We don't do this at all if the result + // was exact. If we do do it, r1 goes negative precisely if we need to round + // up, which sets the C flag. (The previous instruction will have left C + // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if + // we're rounding up.) + subsne r1, r1, r0 + // Recombine the quotient with the sign + exponent, and use the C flag from + // the previous instruction to increment the quotient if we're rounding up. + adc r0, r3, r2, lsl #23 + + // If we haven't either overflowed or underflowed, we're done. We can + // identify most of the safe cases by doing an unsigned comparison of the + // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < + // 0xFC0000 then we have neither underflow nor overflow. + // + // Rationale: the value in the top half of r2 had three chances to be + // incremented before becoming the exponent field of the actual output float. + // It was incremented if we found the numerator mantissa was >= the + // denominator (producing the value in the _bottom_ half of r2, which we just + // ADCed into the output). Then it gets unconditionally incremented again + // when the ADC combines it with the leading mantissa bit. And finally, + // round-up might increment it a third time. So 0xFC is the smallest value + // that can possibly turn into the overflowed value 0xFF after all those + // increments. + // + // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in + // the final result's exponent field (and then rounding might increase it + // further); if the exponent was less than that then r2 wraps round and looks + // like a very large positive integer from the point of view of this unsigned + // comparison. 
+ cmp r2, #0xFC0000 + bxlo lr + + // The same comparison will have set the N and V flags to reflect the result + // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably + // distinguishes potential underflow (r2 is negative) from potential overflow + // (r2 is positive and at least 0xFC0000) + bge LOCAL_LABEL(overflow) + + // Here we might or might not have underflow (but we know we don't have + // overflow). To check more carefully, we look at the _bottom_ half of r2, + // which contains the exponent after the first adjustment (for num >= denom), + // That is, it's still off by 1 (compensating for the leading quotient bit), + // and is also before rounding. + // + // We neglect the effect of rounding: division results that are tiny (less + // than the smallest normalised number) before rounding, but then round up to + // the smallest normal number, are an acceptable edge case to handle slowly. + // We pass those to funder without worrying about them. + // + // So we want to check whether the bottom half of r2 was negative. It would + // be nice to check bits 8-15 of it, but unfortunately, it's already been + // combined with the sign (at bit 8), so those bits don't tell us anything + // useful. Instead we look at the top 4 bits of the exponent field, i.e. the + // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is + // less than 3, so it doesn't reach those bits; the smallest possible + // underflow, obtained by dividing the smallest denormal by the largest + // finite number, is -151 (before the leading bit increments it), which will + // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be + // 0x60 or greater for a (pre-rounding) underflow, and zero for a + // non-underflow. + + tst r2, #0xF0 + bxeq lr // no underflow after all; return + + // Rebias the exponent for funder, which also corrects the sign bit. 
+ add r0, r0, #192 << 23 + // Tell funder whether the true value is greater or less than the number in + // r0. This is obtained from the sign of the remainder (still in r1), with + // the only problem being that it's currently reversed. So negate r1 (leaving + // 0 at 0 to indicate exactness). + rsbs r1, r1, #0 + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // Here we might or might not have overflow (but we know we don't have + // underflow). We must check whether we really have overflowed. + // + // For this it's easiest to check the exponent field in the actual output + // value in r0, after _all_ the adjustments have been completed. The largest + // overflowed exponent is 0x193, and the smallest exponent that can reach + // this is 0xFD (we checked against 0xFC above, but then the leading quotient + // bit incremented it). So it's enough to shift the output left by one + // (moving the exponent field to the top), increment it once more (so that + // the smallest overflowed exponent 0xFF wraps round to 0), and then compare + // against 0xFE000000 as an unsigned integer. + mov r12, r0, lsl #1 + add r12, r12, #1 << 24 + cmp r12, #0xFE << 24 // Check for exp = 253 or 254 + bxhs lr + // We have actual overflow. Rebias r0 to bring the exponent back into range, + // which ensures its sign is correct. Then make an infinity of that sign to + // return. + subs r0, r0, #0xC0 << 23 + movs r12, #0xFF // exponent of infinity + orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 + movs r0, r12, lsl #23 // shift it up to the top of r0 to return + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the start of the function if either input is an uncommon + // value: zero, denormal, infinity or NaN. + // + // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x + // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, + // because the instruction that set it up was conditional. 
So first we + // unconditionally repeat it. + and r3, r12, r1, lsr #7 + + // In all cases not involving a NaN as output, the sign of the output is made + // in the same way as for finite numbers, as the XOR of the input signs. So + // repeat the sign setup from the main branch. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Detect infinities and NaNs, by checking if either of r2 or r3 is at least + // 0xFF0000. + cmp r2, #0xFF0000 + cmplo r3, #0xFF0000 + bhs LOCAL_LABEL(inf_NaN) + + // Now we know there are no infinities or NaNs, but there's at least one zero + // or denormal. + movs r12, r1, lsl #1 // is y zero? + beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + movs r12, r0, lsl #1 // is x zero? (now we know that y is not) + moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) + bxeq lr + + // Now we've eliminated zeroes as well, leaving only denormals: either x or + // y, or both, is a denormal. Call fnorm2 to convert both into a normalised + // mantissa and a (potentially small) exponent. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + + // Now rejoin the main code path, having finished the setup it will expect: + // swap x and y, and shift the fractions back down to the low 24 bits. + mov r12, r0, lsr #8 + mov r0, r1, lsr #8 + mov r1, r12 + b LOCAL_LABEL(div) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 to propagate a NaN from the + // input. 
+ mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and + // finite/inf = 0. + + // If both operands are infinity, we return a NaN. Since we know at + // least _one_ is infinity, we can test this by checking if they're + // equal apart from the sign bits. + eor r3, r0, r1 + lsls r3, #1 // were all bits of XOR zero other than top? + beq LOCAL_LABEL(invalid) // if so, both operands are infinity + + // See if x is infinite + cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? + beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity + + // y is infinite and x is not, so we return a zero of the + // combined sign. + eor r0, r0, r1 // calculate the right sign + and r0, r0, #0x80000000 // throw away everything else + bx lr + +LOCAL_LABEL(divbyzero): + // Here, we know y is zero. But we don't know if x is zero or nonzero. So we + // might be calculating 0/0 (invalid operation, generating a NaN), or + // nonzero/0 (the IEEE "division by zero" exception, generating infinity). + movs r12, r0, lsl #1 // is x zero too? + beq LOCAL_LABEL(invalid) // if so, go and return a NaN + +LOCAL_LABEL(infret): + // Here, we're either dividing infinity by a finite number, or dividing a + // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all + // these cases we return infinity with the sign from r2. + // + // If we were implementing IEEE exceptions, we'd have to separate these + // cases: infinity / finite is not an _exception_, it just returns infinity, + // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here + // we're not implementing exceptions, so we can treat all three cases the + // same. 
+ // + // r2 contains the output sign in bit 8, which is a convenient place to find + // it when making an infinity, because we can fill in the 8 exponent bits + // below that and then shift it left. + orr r2, r2, #0xff // sign + maximum exponent + lsl r0, r2, #23 // shift up to the top + bx lr + +LOCAL_LABEL(invalid): + // Return the default NaN, from an invalid operation (either dividing + // infinity by infinity, or 0 by 0). + ldr r0, =0x7FC00000 + bx lr + +// Finally, the lookup table for the initial reciprocal approximation. +// +// The table index is made from the top 7 bits of the denominator mantissa. But +// the topmost bit is always 1, so only the other 6 bits vary. So it only has +// 64 entries, not 128. +// +// Each table entry is a single byte, with its top bit set. So the table +// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by +// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. +// +// Each of these 64 entries corresponds to a large interval of possible +// mantissas. For example, if the top 7 bits are 1000001 then the overall +// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output +// of this table provides more bits than the input, there are several choices +// of 8-bit reciprocal approximation for a number in that interval. The +// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal +// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable +// choices for that table entry: F9, FA, FB or FC. Which do we pick? +// +// The table below is generated by choosing whichever value minimises the +// maximum possible error _after_ the approximation is improved by the +// Newton-Raphson step. In the example above, we end up with FA. +// +// The Python code below will regenerate the table, complete with the per-entry +// comments. 
+ +/* + +for prefix in range(64, 128): + best = None + + # Max and min 23-bit mantissas with this 7-bit prefix + mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 + + # Max and min table entry corresponding to the reciprocal of something in + # that range of mantissas: round up the reciprocal of mmax, and round down + # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because + # 0x100 can't be used as a table entry due to not fitting in a byte, even + # though it's the exact reciprocal of the overall-smallest mantissa + # 0x800000. + gmin = max(128, (2**31 + mmin - 1) // mmax) + gmax = min(255, 2**31 // mmin) + + # For each of those table entries, compute the result of starting from that + # value and doing a Newton-Raphson iteration, with the mantissa at each end + # of the mantissa interval. One of these will be the worst possible error. + # Choose the table entry whose worst error is as small as possible. + # + # (To find the extreme values of a more general function on an interval, + # you must consider its values not only at the interval endpoints but also + # any turning points within the interval. Here, the function has only one + # turning point, and by construction it takes value 0 there, so we needn't + # worry.) 
+ g = max( + range(gmin, gmax + 1), + key=lambda g: min( + (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] + ), + ) + + print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" + f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" + ) + +*/ + + .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb +LOCAL_LABEL(tab): + .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] + .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] + .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] + .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] + .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] + .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] + .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] + .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] + .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] + .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] + .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] + .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] + .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] + .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] + .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] + .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] + .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] + .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] + .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] + .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] + .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] + .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] + .byte 0xbd // input 
[0xac0000,0xadffff], candidate outputs [0xbd,0xbe] + .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] + .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] + .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] + .byte 0xb5 // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6] + .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] + .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] + .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] + .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] + .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] + .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] + .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] + .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] + .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] + .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] + .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] + .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] + .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] + .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] + .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] + .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] + .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] + .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] + .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] + .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] + .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] + .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] + .byte 0x90 // input [0xe20000,0xe3ffff], 
candidate outputs [0x90,0x90] + .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] + .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] + .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] + .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] + .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] + .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] + .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] + .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] + .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] + .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] + .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] + .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] + .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] + .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] + +END_COMPILERRT_FUNCTION(__divsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..06bbd4339f171 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,42 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. 
// Propagate a NaN result from the inputs of a single-precision binary
// operation, matching the priority rules of Arm hardware FP.
//
// At least one of a, b (IEEE 754 single-precision bit patterns) must be a
// NaN. A signalling NaN (quiet bit 22 clear) takes priority over any quiet
// NaN, and is quietened (bit 22 set) before being returned. Between two NaNs
// of the same kind, the first operand a wins over b.
uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) {
  const uint32_t quiet_bit = 0x00400000;

  // Discard each sign bit by shifting left one place, then add 1 at the
  // position where the quiet-vs-signalling bit landed. This maps every
  // signalling NaN into the very top of the 32-bit range, above 0xff800000,
  // and wraps every quiet NaN round to the very bottom, below 0x00800000,
  // so each class is recognised by a single unsigned comparison.
  uint32_t a_mapped = (a << 1) + 0x00800000;
  uint32_t b_mapped = (b << 1) + 0x00800000;

  if (a_mapped > 0xff800000) // signalling NaN in a wins outright
    return a | quiet_bit;    // return it with the quiet bit forced on
  if (b_mapped > 0xff800000) // otherwise, a signalling NaN in b
    return b | quiet_bit;
  if (a_mapped < 0x00800000) // a quiet NaN in a beats one in b
    return a;
  return b;                  // b must be the remaining quiet NaN
}
// Structure holding the inputs and outputs of __compiler_rt_fnorm2.
//
// On entry: a and b are the two input floats, still in IEEE 754 encoding;
// expa and expb are their 8-bit exponent fields, extracted to the low 8 bits
// of each word but otherwise unmodified. Neither input may be zero, nor have
// the all-ones exponent of an infinity or NaN.
//
// On exit: a and b each hold the corresponding input's mantissa with the
// leading 1 made explicit, justified to the top of the word. A denormal
// input (exponent field 0) is reported as a normalized mantissa together
// with an out-of-range (zero or negative) exponent in expa/expb.
struct fnorm2 {
  uint32_t a, b, expa, expb;
};

// Renormalize a pair of single-precision inputs, so that the caller's main
// arithmetic path can treat denormals uniformly with normal numbers.
void __compiler_rt_fnorm2(struct fnorm2 *values) {
  // Position both mantissas so that an explicit leading 1 belongs in the
  // top bit of each word.
  values->a <<= 8;
  values->b <<= 8;

  if (values->expa != 0) {
    // Normal number: the mantissa is already in place; just supply the
    // implicit leading 1.
    values->a |= 0x80000000;
  } else {
    // Denormal: slide the mantissa up until its leading 1 reaches the top
    // bit, and compensate in the exponent, which drops to zero or below.
    uint32_t up = __builtin_clz(values->a);
    values->a <<= up;
    values->expa = 1 - up;
  }

  // The same treatment for b.
  if (values->expb != 0) {
    values->b |= 0x80000000;
  } else {
    uint32_t up = __builtin_clz(values->b);
    values->b <<= up;
    values->expb = 1 - up;
  }
}
// Re-round an underflowed single-precision result to its correct denormal
// (or zero) representation, using round-to-nearest, ties-to-even.
//
// x is the desired result scaled up by 2^192 (the same bias an IEEE
// 754:1985 underflow trap handler would receive), as a complete IEEE 754
// bit pattern. Because x may itself already have been rounded, errsign
// reports the sign of (true result - x): a positive value means x was
// rounded down, a negative value (top bit set) means x was rounded up, and
// zero means x is exact. That is exactly the information needed to re-round
// correctly at the coarser denormal precision.
uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) {
  const uint32_t kSign = 0x80000000u;
  const uint32_t kMantBits = 23;
  const uint32_t kBias = 0xc0;

  uint32_t sign = x & kSign;
  uint32_t exponent = (x << 1) >> 24; // 8-bit biased exponent field

  // A rebiased exponent of kBias+1 or more means the value is actually
  // normalised and only reached us by mistake: undo the 2^192 scaling by
  // subtracting the exponent bias, and return directly.
  if (exponent > kBias)
    return x - (kBias << kMantBits);

  // How many mantissa bits fall off the bottom of the output denormal.
  uint32_t bits_lost = kBias + 1 - exponent;

  // So small that even the implicit leading 1 sits at least two places
  // below the lowest denormal mantissa bit: round to (signed) zero.
  if (bits_lost > kMantBits + 1)
    return sign;

  // Reconstitute the full mantissa (explicit leading 1) at the top of the
  // word, then nudge it by one ulp in the direction of the true value, so
  // that the tie-breaking below sees the pre-rounding result.
  uint32_t mantissa = kSign | (x << 8);
  mantissa -= errsign >> 31;    // x was rounded up: step back down
  mantissa += (-errsign) >> 31; // x was rounded down: step back up

  // Split the mantissa into the surviving output bits and the discarded
  // low-order bits (kept for the rounding decision below).
  uint32_t outmant, dropped;
  if (bits_lost == kMantBits + 1) {
    // The whole word is discarded; a plain shift by 32 would be undefined,
    // so handle this exponent specially.
    outmant = 0;
    dropped = mantissa;
  } else {
    outmant = mantissa >> (8 + bits_lost);
    dropped = mantissa << (32 - (8 + bits_lost));
  }

  // Round to nearest: the top discarded bit decides, and an exact halfway
  // case (no further discarded bits) rounds the result to even.
  if (dropped & kSign) {
    outmant++;
    if ((dropped << 1) == 0)
      outmant &= ~(uint32_t)1;
  }

  return sign | outmant;
}
+ mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 + andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 + teqne r2, r12 // otherwise, sets Z if exponent of x is FF + teqne r3, r12 // otherwise, sets Z if exponent of y is FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r2. + teq r0, r1 // sets N to the XOR of x and y's sign bits + orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 + + // Move the input mantissas to the high end of r0/r1, each with its leading + // bit set explicitly, so that they're in the right form to be multiplied. + mov r12, #0x80000000 + orr r0, r12, r0, lsl #8 + orr r1, r12, r1, lsl #8 + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. The denormal decoding will also + // have to set up the same register contents: + // - decoded fractions at the top of r0 and r1 + // - exponents in r2 and r3, starting at bit 16 + // - output sign in r2 bit 8 +LOCAL_LABEL(mul): + + // Here we multiply the mantissas, and compute the output exponent by adding + // the input exponents and rebiasing. These operations are interleaved to + // use a delay slot. + // + // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd + // expect. That compensates for the leading bit of the mantissa overlapping + // it, when we recombine the exponent and mantissa by addition. + add r2, r2, r3 // r2 has sum of exponents, freeing up r3 + umull r1, r3, r0, r1 // r3:r1 has the double-width product + sub r2, r2, #(0x80 << 16) // rebias the summed exponent + + // Compress the double-word product into just the high-order word r3, by + // setting its bit 0 if any bit of the low-order word is nonzero. 
This + // changes the represented value, but not by nearly enough to affect + // rounding, because rounding only depends on the bit below the last output + // bit, and the general question of whether _any_ nonzero bit exists below + // that. + cmp r1, #0 // if low word of full product is nonzero + orrne r3, r3, #1 // then set LSB of high word + + // The two inputs to UMULL had their high bits set, that is, were at least + // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. + // the high bit of the product could be at the top of the word or one bit + // below. Check which, by experimentally shifting left, and then undoing it + // via RRX if we turned out to have shifted off a 1 bit. + lsls r3, r3, #1 // shift left, setting C to the bit shifted off + rrxcs r3, r3 // if that bit was 1, put it back again + + // That ensured the leading 1 bit of the product is now the top of r3, but + // also, set C if the leading 1 was _already_ in the top bit. So now we know + // whether to increment the exponent. The following instruction does the + // conditional increment (because it's ADC), but also, copies the exponent + // field from bit 16 of r2 into bit 0, so as to place it just below the + // output sign bit. + // + // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 + // are exactly what we need to combine with the rounded mantissa. But the + // full output exponent (with extra bits) is still available in the high half + // of r2, so that we can check _whether_ we overflowed or underflowed. + adc r2, r2, r2, asr #16 + + // Recombine the exponent and mantissa, doing most of the rounding as a side + // effect: we shift the mantissa right so as to put the round bit into C, and + // then we recombine with the exponent using ADC, to increment the mantissa + // if C was set. 
+ movs r12, r3, lsr #8 + adc r0, r12, r2, lsl #23 + + // To complete the rounding, we must check for the round-to-even tiebreaking + // case, by checking if we're in the exact halfway case, which occurs if and + // only if we _did_ round up (we can tell this because C is still set from + // the MOVS), and also, no bit of r3 is set _below_ the round bit. + // + // We combine this with an overflow check, so that C ends up set if anything + // weird happened, and clear if we're completely finished and can return. + // + // The best instruction sequence for this part varies between Arm and Thumb. +#if !__thumb__ + // Arm state: if C was set then we check the low bits of r3, so that Z ends + // up set if we need to round to even. + // + // (We rely here on Z reliably being clear to begin with, because shifting + // down the output mantissa definitely gave a nonzero output. Also, the TST + // doesn't change C, so if Z does end up set, then C was also set.) + // + // Then, if we're not rounding to even, we do a CMP which sets C if there's + // been an overflow or an underflow. An overflow could occur for an output + // exponent as low as 0xFC, because we might increment the exponent by 1 when + // renormalizing, by another when recombining with the mantissa, and by one + // more if rounding up causes a carry off the top of the mantissa. An + // underflow occurs only if the output exponent is negative (because it's + // offset by 1, so an exponent of 0 will be incremented to 1), in which case + // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to + // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also + // catches a few very large cases that _don't_ quite overflow (exponents of + // 0xFC and above that don't get maximally unlucky); those will also be + // handled by the slow path. 
+  tstcs r3, #0x7F
+  cmpne r2, #0xFC0000
+#else
+  // In Thumb, switching between different conditions has a higher cost due to
+  // the (implicit in this code) IT instructions, so we prefer a strategy that
+  // uses CC and CS conditions throughout, at the cost of requiring some extra
+  // cleanup instructions on the slow path.
+  //
+  // If C is set (and hence round-to-even is a possibility), the basic idea is
+  // to shift the full result word (r3) left by 25, leaving only its bottom 7
+  // bits, which are now the top 7 bits; then we want to set C iff these are 0.
+  //
+  // The "CMP x,y" instruction sets C if x >= y (as unsigned integers). So this
+  // could be done in one instruction if only we had a register to use as x,
+  // which has 0 in the top 7 bits and at least one nonzero. Then we could
+  // compare that against the shifted-up value of r3, setting C precisely if
+  // the top 7 bits of y are all zero. And happily, we _do_ have such a
+  // register! r12 contains the shifted-down mantissa, which is guaranteed to
+  // have a 1 in bit 23, and 0 above that.
+  //
+  // The shift of r3 happens only in the second operand of the compare, so we
+  // don't lose the original value of r3 in this process.
+  //
+  // The check for over/underflow is exactly as in the Arm branch above, except
+  // based on a different condition.
+  cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even
+  cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed
+#endif
+
+  // That's all the checks for difficult cases done. If C is clear, we can
+  // return.
+  bxcc lr
+
+  // Now the slower path begins. We have to recover enough information to
+  // handle all of round-to-even, overflow and underflow.
+  //
+  // Round to even is the most likely of these, so we detect it first and
+  // handle it as fast as possible.
+
+#if __thumb__
+  // First, Thumb-specific compensation code. 
The Arm branch of the #if above
+  // will have set Z=1 to indicate round to even, but the Thumb branch didn't
+  // leave any unambiguous indicator of RTE, so we must retest by checking all
+  // the bits shifted off the bottom of the mantissa to see if they're exactly
+  // the half-way value.
+  lsl r12, r3, #24 // r12 = round bit and everything below
+  cmp r12, #0x80000000 // set Z if that is exactly 0x80000000
+#endif
+
+  // Now Z is set iff we have already rounded up and now must replace that
+  // with rounding to even, which is done by just clearing the low bit of the
+  // mantissa.
+  biceq r0, r0, #1
+
+  // Redo the over/underflow check (the same way as in both branches above),
+  // and if it doesn't report a danger, we can return the rounded-to-even
+  // answer.
+  cmp r2, #0xFC0000 // check for over/underflow
+  bxcc lr // and return if none.
+
+  // Now we only have overflow and underflow left to handle. First, find out
+  // which we're looking at. This is easy by testing the top bit of r2, but
+  // even easier by using the fact that the possible positive and negative
+  // values of r2 are widely enough separated that the 0xFC0000 subtracted by
+  // the CMP above won't have made any difference. So the N flag output from
+  // that comparison _already_ tells us which condition we have: if N is set we
+  // have underflow, and if N is clear, overflow.
+  bpl LOCAL_LABEL(overflow)
+
+  // Here we're handling underflow.
+
+  // Add the IEEE 754:1985 exponent bias which funder will expect. This also
+  // brings the exponent back into a range where it can't possibly have carried
+  // into the sign bit, so the output sign will now be right.
+  add r0, r0, #(0xC0 << 23)
+
+  // Determine whether we rounded up, down or not at all.
+  lsls r2, r3, #1 // input mantissa, without its leading 1
+  subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise)
+
+  // And let funder handle the rest. 
+ b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // We come here to handle overflow, but it's not guaranteed that an overflow + // has actually happened: our check on the fast path erred on the side of + // caution, by catching any output exponent that _could_ cause an overflow. + // So first check whether this really is an overflow, by extracting the + // output exponent. Exponent 0xFF, or anything that wrapped round to having + // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. + // + // The value in r0 is correct to return, if there's no overflow. + add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + movs r12, r12, lsl #1 // test the top bit of the modified value + bxmi lr // if top bit is still 1, not an overflow + + // This is an overflow, so we need to replace it with an appropriately signed + // infinity. First we correct the sign by applying a downward bias to the + // exponent (the one suggested in IEEE 754:1985, which was chosen to bring + // all possible overflowed results back into range). + subs r0, r0, #(0xC0 << 23) + + // Now the sign bit of r0 is correct. Replace everything else with the + // encoding of an infinity. + mov r1, #0xFF + and r0, r0, #0x80000000 + orr r0, r0, r1, lsl #23 + bx lr + +LOCAL_LABEL(uncommon): + // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that + // we've at least done the first _two_ instructions from the entry point, + // even if all the rest were skipped. So r2 contains the sign and exponent of + // x in bits 16..23, and r12 = 0xFF << 16. + // + // So, first repeat some instructions from the prologue, which were either + // conditionally skipped in the sequence leading to the branch, or skipped + // because they happened after the branch. 
+ and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 + teq r0, r1 // calculate the sign of the result + orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before + + // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at + // least 0xFF0000 (hence the exponent field is equal to 0xFF). + cmp r2, r12 + cmplo r3, r12 + bhs LOCAL_LABEL(inf_NaN) + + // If we didn't take that branch, then we have only finite numbers, but at + // least one is denormal or zero. A zero makes the result easy (and also is a + // more likely input than a denormal), so check those first, as fast as + // possible. + movs r12, r0, lsl #1 // Z set if x == 0 + movsne r12, r1, lsl #1 // now Z set if either input is 0 + moveq r0, r2, lsl #23 // in either case, make 0 of the output sign + bxeq lr // and return it + + // Now we know we only have denormals to deal with. Call fnorm2 to sort + // them out, and rejoin the main code path above. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input; otherwise any multiplication involving infinity returns + // infinity, unless it's infinity * 0 which is an invalid operation and + // returns NaN again. 
+ mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // NaNs are dealt with, so now we have at least one infinity. Check if the + // other operand is 0. This is conveniently done by XORing the two: because + // we know that the low 31 bits of one operand are exactly 0x7F800000, we can + // test if the low 31 bits of the other one are all 0 by checking whether the + // low 31 bits of (x XOR y) equal 0x7F800000. + eor r3, r0, r1 + cmp r12, r3, lsl #1 // if inf * 0, this sets Z + lsr r0, r12, #1 // set up return value of +infinity + orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign + orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + bx lr // and return + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S new file mode 100644 index 0000000000000..f2ede1013a9e6 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -0,0 +1,251 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 +// assembly language. 
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .thumb + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) + push {r4,r5,r6,lr} + + // Get exponents of the inputs, and check for uncommon values. In the process + // of this we also compute the sign, because it's marginally quicker that + // way. + lsls r2, r0, #1 + adcs r4, r4, r4 // set r4[0] to sign bit of x + lsls r3, r1, #1 + adcs r4, r4, r3 // set r4[0] to the output sign + lsrs r2, r2, #24 + beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS + lsrs r3, r3, #24 + beq LOCAL_LABEL(zerodenorm) + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Compute the output exponent. We'll be generating our product _without_ the + // leading bit, so we subtract 0x7f rather than 0x80. + adds r2, r2, r3 + subs r2, r2, #0x7f + // Blank off everything above the mantissas. + lsls r0, r0, #9 + lsls r1, r1, #9 +LOCAL_LABEL(normalised): // we may come back here from zerodenorm + lsrs r0, r0, #9 + lsrs r1, r1, #9 + // Multiply. r0 and r1 are the mantissas of the inputs but without their + // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). + // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. + // + // The technique below will actually compute P - 2^46, by not adding on the + // term where the two 2^23 are multiplied. The 48-bit result will be + // delivered in two output registers, one containing its bottom 32 bits and + // the other containing the top 32, so they overlap in the middle 16 bits. + // This is done using only two multiply instructions and some bookkeeping. + // + // In the comments I'll write X and Y for the original input mantissas (again + // without their leading bits). 
I'll also decompose them as X = xh + xl and + // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are + // multiples of 2^8. + adds r5, r0, r1 + lsls r5, r5, #7 // r5 = (X+Y) << 7 + movs r6, r0 + muls r6, r1, r6 // r6 is congruent mod 2^32 to X*Y + lsrs r0, r0, #8 + lsrs r1, r1, #8 + muls r0, r1, r0 + lsls r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh + subs r3, r6, r1 // now r3 is congruent mod 2^32 to + // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl + // and hence, since that is at most 0xfeff0001, + // is _exactly_ equal to that + adds r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 + lsrs r1, r3, #16 // r1 is the top 16 bits of r3, i.e. + // (xh*yl + xl*yh + xl*yl) >> 16 + adds r3, r0, r1 // now r3 equals + // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 + // i.e. (X*Y + (X+Y)<<23) >> 16, + // i.e. (the right answer) >> 16. + // Meanwhile, r6 is exactly the bottom 32 bits of the + // right answer. + // Renormalise if necessary. + lsrs r1, r3, #30 + beq LOCAL_LABEL(norenorm) + // Here we have to do something fiddly. Renormalisation would be a trivial + // job if we had the leading mantissa bit - just note that it's one bit + // position above where it should be, and shift right by one. But without + // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just + // shifting right would of course give us (x - 2^29), so we must subtract an + // extra 2^29 to fix this up. + lsrs r3, r3, #1 + movs r1, #1 + lsls r1, r1, #29 + subs r3, r3, r1 + adds r2, r2, #1 +LOCAL_LABEL(norenorm): + // Round and shift down to the right bit position. + lsrs r0, r3, #7 // round bit goes into the carry flag + bcc LOCAL_LABEL(rounded) + adds r0, r0, #1 + // In the round-up branch, we must also check if we have to round to even, by + // testing all the bits below the round bit. We will normally not expect to, + // so we do RTE by branching out of line and back again to avoid spending a + // branch in the common case. 
+ lsls r5, r3, #32-7+1 // check the bits shifted out of r3 above + bne LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even + lsls r5, r6, #15 // check the bottom 17 bits of the low-order 32 + // (enough to overlap r3 even if we renormalised) + beq LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE +LOCAL_LABEL(rounded): + // Put on the sign and exponent, check for underflow and overflow, and + // return. + // + // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if + // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since + // this code doesn't report exceptions, we can ignore this case because it'll + // happen to return the right answer regardless). So we handle most of this + // via an unsigned comparison against 0xFF, which leaves the one case of a + // zero exponent that we have to filter separately by testing the Z flag + // after we shift the exponent back up into place. + cmp r2, #0xFF // check for most over/underflows + bhs LOCAL_LABEL(outflow) // ... and branch out of line for them + lsls r5, r2, #23 // shift the exponent into its output location + beq LOCAL_LABEL(outflow) // ... and branch again if it was 0 + lsls r4, r4, #31 // shift the output sign into place + orrs r0, r0, r4 // and OR it in to the output + adds r0, r0, r5 // OR in the mantissa + pop {r4,r5,r6,pc} // and return + +LOCAL_LABEL(rte): + // Out-of-line handler for the round-to-even case. Clear the low mantissa bit + // and go back to the post-rounding code. + movs r5, #1 + bics r0, r0, r5 + b LOCAL_LABEL(rounded) + +LOCAL_LABEL(outflow): + cmp r2, #0 + bgt LOCAL_LABEL(overflow) + // To handle underflow, we construct an intermediate value in the IEEE 754 + // style (using our existing full-length mantissa, and bias the exponent by + // +0xC0), and indicate whether that intermediate was rounded up, down or not + // at all. Then call the helper function funder, which will denormalise and + // re-round correctly. 
+ lsls r1, r0, #7 // shift up the post-rounding mantissa + subs r1, r3, r1 // and subtract it from the pre-rounding version + lsls r6, r6, #15 + cmp r6, #1 // if the rest of the low bits are nonzero + adcs r1, r1, r1 // then set an extra bit at the bottom + + lsls r4, r4, #31 + orrs r0, r0, r4 // put on the sign + adds r2, r2, #192 // bias the exponent + lsls r3, r2, #23 + adds r0, r0, r3 // put on the biased exponent + + bl SYMBOL_NAME(__compiler_rt_funder) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(overflow): + // Handle overflow by returning an infinity of the correct sign. + lsls r4, r4, #8 // move the sign up to bit 8 + movs r0, #0xff + orrs r0, r0, r4 // fill in an exponent just below it + lsls r0, r0, #23 // and shift those 9 bits up to the top of the word + pop {r4,r5,r6,pc} + + // We come here if there's at least one zero or denormal. On the fast path + // above, it was convenient to check these before checking NaNs and + // infinities, but NaNs take precedence, so now we're off the fast path, we + // must still check for those. + // + // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input + // exponents. So if we branched after shifting-and-checking r2, we come to + // this earlier entry point 'zerodenorm0' so that we still shift r3. +LOCAL_LABEL(zerodenorm0): + lsrs r3, r3, #24 +LOCAL_LABEL(zerodenorm): + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Now we know we have at least one zero or denormal, and no NaN or infinity. + // Check if either input is actually zero. We've ruled out 0 * infinity by + // this point, so any zero input means we return zero of the correct sign. + lsls r6, r0, #1 // is one input zero? + beq LOCAL_LABEL(zero) // yes, go and return zero + lsls r6, r1, #1 // is the other one zero? 
+ bne LOCAL_LABEL(denorm) // if not, one must have been a denormal +LOCAL_LABEL(zero): + lsls r0, r4, #31 // shift up the output sign to make the return value + pop {r4,r5,r6,pc} + + // Handle denormals via the helper function fnorm2, which will break both + // inputs up into mantissa and exponent, renormalising and generating a + // negative exponent if necessary. +LOCAL_LABEL(denorm): + push {r0,r1,r2,r3} + mov r0, sp + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0,r1,r2,r3} + // Convert fnorm2's return values into the right form to rejoin the main + // code path. + lsls r0, r0, #1 + lsls r1, r1, #1 + adds r2, r2, r3 + subs r2, r2, #0x7f + b LOCAL_LABEL(normalised) + + // We come here if at least one input is a NaN or infinity. There may still + // be zeroes (or denormals, though they make no difference at this stage). +LOCAL_LABEL(naninf): + movs r6, #0xff + lsls r6, r6, #24 + lsls r5, r0, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // first operand is a NaN + lsls r5, r1, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // second operand is a NaN + + // We know we have at least one infinity, and no NaNs. We might also have a + // zero, in which case we return the default quiet NaN. + lsls r6, r0, #1 + beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + lsls r6, r1, #1 + beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + // Otherwise we have infinity * infinity, or infinity * finite. Just return + // an appropriately signed infinity. + b LOCAL_LABEL(overflow) // reuse the code there + + // We come here if at least one input is a NaN. Hand off to fnan2, which + // propagates an appropriate NaN to the output, dealing with the special + // cases of signalling/quiet NaNs. +LOCAL_LABEL(nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + + // Return a quiet NaN as the result of infinity * zero. 
+LOCAL_LABEL(infzero): + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 63f4c94605c90..8e3cb35183ba7 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -35,6 +35,10 @@ if(APPLE) darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH) endif() +if(COMPILER_RT_ARM_OPTIMIZED_FP) + list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP) +endif() + foreach(arch ${BUILTIN_TEST_ARCH}) set(BUILTINS_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX) diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index f8cb6169ac283..12c5df5fdaae1 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,115 +1,428 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // RUN: %clang_builtins %s %librt -o %t && %run %t // REQUIRES: librt_has_divsf3 #include "int_lib.h" +#include #include #include "fp_test.h" +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. 
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + // Returns: a / b COMPILER_RT_ABI float __divsf3(float a, float b); -int test__divsf3(float a, float b, uint32_t expected) -{ - float x = __divsf3(a, b); - int ret = compareResultF(x, expected); +int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __divsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif - if (ret){ - printf("error in test__divsf3(%.20e, %.20e) = %.20e, " - "expected %.20e\n", a, b, x, - fromRep32(expected)); - } - return ret; + if (ret) { + printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; } -int main() -{ - // Returned NaNs are assumed to be qNaN by default - - // qNaN / any = qNaN - if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) - return 1; - // NaN / any = NaN - if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) - return 1; - // any / qNaN = qNaN - if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) - return 1; - // any / NaN = NaN - if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) - return 1; - - // +Inf / positive = +Inf - if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) - return 1; - // +Inf / negative = -Inf - if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / positive = -Inf - if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / negative = +Inf - if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) - return 1; - - // Inf / Inf = NaN - if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) - return 1; - // 0.0 / 0.0 = NaN - if 
(test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) - return 1; - // +0.0 / +Inf = +0.0 - if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) - return 1; - // +Inf / +0.0 = +Inf - if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // positive / +0.0 = +Inf - if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - // positive / -0.0 = -Inf - if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / +0.0 = -Inf - if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / -0.0 = +Inf - if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // 1/3 - if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) - return 1; - // smallest normal result - if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) - return 1; +int main(void) { + int status = 0; - // divisor is exactly 1.0 - if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) - return 1; - // divisor is truncated to exactly 1.0 in UQ1.15 - if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) - return 1; + status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); + status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); + status |= 
test__divsf3(0x00000000, 0xc0e00000, 0x80000000); + status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); + status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); + status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); + status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); + status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); + status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); + status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); + status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); + status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); + status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); + status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); + status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); + status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff); + status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); + status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); + status |= 
test__divsf3(0x00800001, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); + status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); + status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000); + status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); + status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); + status |= test__divsf3(0x01000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); + status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); + status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); + status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); + status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); + status |= 
test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); + status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); + status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); + status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); + status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); + status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); + status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); + status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); + status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); + status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); + status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); + status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); + status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); + status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); + status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); + status |= 
test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); + status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); + status |= test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); + status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); + status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); + status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); + status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); + status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); + status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); + status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); + status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); + status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); + status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); + status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); + status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); + status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); + status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); + status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); + status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); + status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); + status |= 
test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); + status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); + status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); + status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); + status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); + status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); + status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); + status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); + status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); + status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); + status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= 
test__divsf3(0x7f800000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); + status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); + status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); + status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); + status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); + status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); + status |= 
test__divsf3(0x80000001, 0xff7fffff, 0x00000000); + status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); + status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80000004, 0xff800000, 0x00000000); + status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); + status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); + status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); + status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); + status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); + status |= test__divsf3(0x81000005, 0x00800005, 0xc0000000); + status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); + status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); + status |= 
test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); + status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); + status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000); + status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); + status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); + status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); + status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); + status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); + status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); + status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); + status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); + status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); + status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); + status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); + status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); + status |= 
test__divsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); + status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); + status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); - // smallest normal value divided by 2.0 - if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) - return 1; - // smallest subnormal result - if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) - return 1; + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. 
+ status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); - // some misc test cases obtained by fuzzing against h/w implementation - if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006))) - return 1; - if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000))) - return 1; - if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000))) - return 1; +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/divsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. 
- // test 1 / (1 - eps(0.5)) = 1 + eps(1) - if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= 
test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); +#endif // ARM_NAN_HANDLING - return 0; + return status; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c 
b/compiler-rt/test/builtins/Unit/mulsf3_test.c new file mode 100644 index 0000000000000..7dc7c8ad39c32 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c @@ -0,0 +1,616 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_mulsf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b +COMPILER_RT_ABI float __mulsf3(float a, float b); + +int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __mulsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif + + if (ret) { + printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main(void) { + int status = 0; + + status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x80000000, 
0x80000000); + status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000); + status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001); + status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001); + status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002); + status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001); + status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003); + status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003); + status |= test__mulsf3(0x00000008, 0x3e000000, 0x00000001); + status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); + status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); + status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); + status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); + status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); + status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); + status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); + status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); + status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); + status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); + status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); + status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); + status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); + 
status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); + status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); + status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); + status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); + status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); + status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001); + status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); + status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); + status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); + status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); + status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); + status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); + status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000); + status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); + status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); + status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); + status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); + status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); + status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); + status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); + status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); + status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); + status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); + status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); + status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); + status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); + status |= 
test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); + status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); + status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); + status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); + status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); + status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); + status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); + status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); + status |= test__mulsf3(0x40400000, 0x00000002, 0x00000006); + status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); + status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); + status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); + status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); + status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); + status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); + status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); + status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); + status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); + status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); + status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); + status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); + status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); + status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); + status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); + status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); + status |= 
test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); + status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); + status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); + status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); + status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); + status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); + status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); + status |= 
test__mulsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); + status |= test__mulsf3(0x80000001, 0x40a00000, 0x80000005); + status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); + status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); + status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); + status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); + status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); + status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); + status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); + status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); + status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); + status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); + status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); + status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); + status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); + status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); + status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); + status |= 
test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); + status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); + status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); + status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); + status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); + status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); + status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); + status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); + status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); + status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); + status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); + status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001); + status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); + status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); + status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); + status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); + status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); + status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); + status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); + status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); + status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); + status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); + status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); + status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); + status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); + status |= 
test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); + status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); + status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); + status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); + status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); + status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); + status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); + status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); + status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); + status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= 
test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); + status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); + status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); + status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); + status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); + status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); + status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); + status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); + status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); + status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); + status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); + status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); + status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); + status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); + status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); + status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); + status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); + status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); + status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); + status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); + status |= test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); + status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); + status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); + status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); + status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); + status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); + status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); + status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); + status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); + status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); + status |= 
test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); + status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); + status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); + status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); + status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); + status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); + status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); + status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); + status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); + status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); + status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); + status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); + status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); + status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); + status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); + status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); + status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); + status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); + status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); + status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); + status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); + status |= 
test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); + status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); + status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); + status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); + status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); + status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); + status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); + status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); + status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); + status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); + status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); + status |= test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); + status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); + status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); + status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); + status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); + status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); + status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); + status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); + status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); + status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); + status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); + status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); + status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); + status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); + status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); + status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); + status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); + status |= 
test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); + status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); + status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); + status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); + status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); + status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); + status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); + status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); + status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); + status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); + status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); + status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); + status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); + status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); + status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); + status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); + status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); + status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); + status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); + status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); + status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); + status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); + status |= 
test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); + status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); + status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); + status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); + status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); + status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); + status |= test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); + status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); + status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); + status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); + status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); + status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); + status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); + status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); + status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); + status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); + status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); + status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); + status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); + status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); + status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); + status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); + status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); + status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); + status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); + status |= 
test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); + status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); + status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); + status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); + status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); + status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); + status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); + status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); + status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); + status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); + status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); + status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); + status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); + status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); + status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); + status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); + status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); + status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); + status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); + status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); + status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); + status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); + status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); + status |= 
test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); + status |= test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); + status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); + status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); + status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); + status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); + status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); + status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); + status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); + status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); + status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); + status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); + status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); + status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); + status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); + status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); + status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); + status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000); + status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4); + status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae); + status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000); + status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057); + status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47); + status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000); + status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23); + status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6); + status |= 
test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000); + status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b); + status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95); + status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000); + status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb); + status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000); + status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007); + status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f); + status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00); + status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21); + status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001); + status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); + status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. 
+ status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/mulsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. 
+ + status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= 
test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // ARM_NAN_HANDLING + + return status; +} From 12322b22c68a588caeee8702946695de0a8ba788 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 13 Nov 2025 16:33:18 +0000 Subject: [PATCH 08/25] [AArch64][SVE] Allow basic use of `target("aarch64.svcount")` with +sve (#167875) This prevents the backend from crashing for basic uses of __SVCount_t type (e.g., as function arguments), without +sve2p1 or +sme2. 
Fixes #167462 --- .../Target/AArch64/AArch64ISelLowering.cpp | 20 ++++++++++--------- .../CodeGen/AArch64/sme-aarch64-svcount.ll | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b51f453b4974..13339818b0b68 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -445,6 +445,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); + // Add sve predicate as counter type + addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass); + // Add legal sve data types addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); @@ -473,15 +476,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { - addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass); - setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1); - setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1); - - setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom); - setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand); - } - // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -1609,6 +1603,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); + // Promote predicate as counter load/stores to standard predicates. + setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1); + setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1); + + // Predicate as counter legalization actions. 
+ setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom); + setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand); + for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); diff --git a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll index aee705f0be9b9..ecbf0bdb06d15 100644 --- a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll +++ b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O0 -mtriple=aarch64 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-O0 -; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-O3 +; RUN: llc -O0 -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-O0 +; RUN: llc -O3 -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-O3 ; ; Test simple loads, stores and return. From c78fb8dfb8fd586612f8e5e31d54f600d1c5cdc8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 13 Nov 2025 08:38:01 -0800 Subject: [PATCH 09/25] [ADT] Simplify SmallDenseMap::grow (NFC) (#167829) Without this patch, SmallDenseMap::grow has two separate code paths to grow the bucket array. The code path to handle the small mode has its own traversal over the bucket array. This patch simplifies this logic as follows: 1. Allocate a temporary instance of SmallDenseMap. 2. Move valid key/value pairs to the temporary instance. 3. Move LargeRep to *this. Remarks: - This patch adds moveFromImpl to move key/value pairs. moveFromOldBuckets is updated to use the new helper function. - This patch adds a private constructor to SmallDenseMap that takes an exact number of buckets, accompanied by tag ExactBucketCount. - This patch adds a fast path to deallocateBuckets in case getLargeRep()->NumBuckets == 0, just like destroyAll. 
This path is used to destruct zombie instances after moves. - In somewhat rare cases, we "grow" from the small mode to the small mode when there are many tombstones in the inline storage. This is handled with another call to moveFrom. --- llvm/include/llvm/ADT/DenseMap.h | 75 ++++++++++++++------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index d5b13e7731550..9d61a91631fab 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -413,9 +413,7 @@ class DenseMapBase : public DebugEpochBase { return NextPowerOf2(NumEntries * 4 / 3 + 1); } - void moveFromOldBuckets(iterator_range OldBuckets) { - initEmpty(); - + void moveFromImpl(iterator_range OldBuckets) { // Insert all the old elements. const KeyT EmptyKey = KeyInfoT::getEmptyKey(); const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); @@ -438,6 +436,14 @@ class DenseMapBase : public DebugEpochBase { } } + void moveFromOldBuckets(iterator_range OldBuckets) { + initEmpty(); + moveFromImpl(OldBuckets); + } + + // Move key/value from Other to *this. Other will be in a zombie state. + void moveFrom(DerivedT &Other) { moveFromImpl(Other.buckets()); } + void copyFrom(const DerivedT &other) { this->destroyAll(); derived().deallocateBuckets(); @@ -889,6 +895,12 @@ class SmallDenseMap /// a large bucket. This union will be discriminated by the 'Small' bit. AlignedCharArrayUnion storage; + struct ExactBucketCount {}; + SmallDenseMap(unsigned NumBuckets, ExactBucketCount) { + allocateBuckets(NumBuckets); + this->BaseT::initEmpty(); + } + public: explicit SmallDenseMap(unsigned NumElementsToReserve = 0) { init(NumElementsToReserve); @@ -1065,7 +1077,10 @@ class SmallDenseMap } void deallocateBuckets() { - if (Small) + // Fast path to deallocateBuckets in case getLargeRep()->NumBuckets == 0, + // just like destroyAll. This path is used to destruct zombie instances + // after moves. 
+ if (Small || getLargeRep()->NumBuckets == 0) return; deallocate_buffer(getLargeRep()->Buckets, @@ -1096,46 +1111,22 @@ class SmallDenseMap if (AtLeast > InlineBuckets) AtLeast = std::max(64, NextPowerOf2(AtLeast - 1)); - if (Small) { - // First move the inline buckets into a temporary storage. - AlignedCharArrayUnion TmpStorage; - BucketT *TmpBegin = reinterpret_cast(&TmpStorage); - BucketT *TmpEnd = TmpBegin; + SmallDenseMap Tmp(AtLeast, ExactBucketCount{}); + Tmp.moveFrom(*this); - // Loop over the buckets, moving non-empty, non-tombstones into the - // temporary storage. Have the loop move the TmpEnd forward as it goes. - const KeyT EmptyKey = KeyInfoT::getEmptyKey(); - const KeyT TombstoneKey = KeyInfoT::getTombstoneKey(); - for (BucketT &B : inlineBuckets()) { - if (!KeyInfoT::isEqual(B.getFirst(), EmptyKey) && - !KeyInfoT::isEqual(B.getFirst(), TombstoneKey)) { - assert(size_t(TmpEnd - TmpBegin) < InlineBuckets && - "Too many inline buckets!"); - ::new (&TmpEnd->getFirst()) KeyT(std::move(B.getFirst())); - ::new (&TmpEnd->getSecond()) ValueT(std::move(B.getSecond())); - ++TmpEnd; - B.getSecond().~ValueT(); - } - B.getFirst().~KeyT(); - } - - // AtLeast == InlineBuckets can happen if there are many tombstones, - // and grow() is used to remove them. Usually we always switch to the - // large rep here. - allocateBuckets(AtLeast); - this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); - return; + if (Tmp.Small) { + // Use moveFrom in those rare cases where we stay in the small mode. This + // can happen when we have many tombstones. 
+ this->BaseT::initEmpty(); + this->moveFrom(Tmp); + Tmp.Small = false; + Tmp.getLargeRep()->NumBuckets = 0; + } else { + Small = false; + NumTombstones = 0; + *getLargeRep() = std::move(*Tmp.getLargeRep()); + Tmp.getLargeRep()->NumBuckets = 0; } - - LargeRep OldRep = std::move(*getLargeRep()); - getLargeRep()->~LargeRep(); - allocateBuckets(AtLeast); - - this->moveFromOldBuckets(OldRep.buckets()); - - // Free the old table. - deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets, - alignof(BucketT)); } // Plan how to shrink the bucket table. Return: From 0acdbd5d81c0aaaad018adff8857d08502f4beac Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 13 Nov 2025 08:38:35 -0800 Subject: [PATCH 10/25] [InstrRef] Consistently use MLocTracker::getLocID() before calling lookupOrTrackRegister (#167841) The LocID for registers is just the register ID. The getLocID function is supposed to hide this detail, but it wasn't being used consistently. This avoids a bunch of implicit casts from Register or MCRegister to unsigned. --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 33 ++++++++++-------- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 34 +++++++++---------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 0037bdd270ff3..6dda0fddbcec8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1603,8 +1603,8 @@ std::optional InstrRefBasedLDV::getValueForInstrRef( unsigned MainRegSize = TRI->getRegSizeInBits(*TRC); if (Size != MainRegSize || Offset) { // Enumerate all subregisters, searching. 
- Register NewReg = 0; - for (MCPhysReg SR : TRI->subregs(Reg)) { + Register NewReg = Register(); + for (MCRegister SR : TRI->subregs(Reg)) { unsigned Subreg = TRI->getSubRegIndex(Reg, SR); unsigned SubregSize = TRI->getSubRegIdxSize(Subreg); unsigned SubregOffset = TRI->getSubRegIdxOffset(Subreg); @@ -1620,7 +1620,8 @@ std::optional InstrRefBasedLDV::getValueForInstrRef( } else { // Re-state the value as being defined within the subregister // that we found. - LocIdx NewLoc = MTracker->lookupOrTrackRegister(NewReg); + LocIdx NewLoc = + MTracker->lookupOrTrackRegister(MTracker->getLocID(NewReg)); NewID = ValueIDNum(NewID->getBlock(), NewID->getInst(), NewLoc); } } @@ -1818,12 +1819,13 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) { Register Reg = MO.getReg(); ValueIDNum Num = MTracker->readReg(Reg); auto PHIRec = DebugPHIRecord( - {InstrNum, MI.getParent(), Num, MTracker->lookupOrTrackRegister(Reg)}); + {InstrNum, MI.getParent(), Num, + MTracker->lookupOrTrackRegister(MTracker->getLocID(Reg))}); DebugPHINumToValue.push_back(PHIRec); // Ensure this register is tracked. for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI) - MTracker->lookupOrTrackRegister(*RAI); + MTracker->lookupOrTrackRegister(MTracker->getLocID(*RAI)); } else if (MO.isFI()) { // The value is whatever's in this stack slot. unsigned FI = MO.getIndex(); @@ -1949,8 +1951,8 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) { // different location. // Inform TTracker about any direct clobbers. - for (uint32_t DeadReg : DeadRegs) { - LocIdx Loc = MTracker->lookupOrTrackRegister(DeadReg); + for (MCRegister DeadReg : DeadRegs) { + LocIdx Loc = MTracker->lookupOrTrackRegister(MTracker->getLocID(DeadReg)); TTracker->clobberMloc(Loc, MI.getIterator(), false); } @@ -1995,9 +1997,9 @@ void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) { // Copy subregisters from one location to another. 
for (MCSubRegIndexIterator SRI(SrcRegNum, TRI); SRI.isValid(); ++SRI) { - unsigned SrcSubReg = SRI.getSubReg(); + MCRegister SrcSubReg = SRI.getSubReg(); unsigned SubRegIdx = SRI.getSubRegIndex(); - unsigned DstSubReg = TRI->getSubReg(DstRegNum, SubRegIdx); + MCRegister DstSubReg = TRI->getSubReg(DstRegNum, SubRegIdx); if (!DstSubReg) continue; @@ -2006,8 +2008,10 @@ void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) { // yet. // This will force SrcSubReg to be tracked, if it isn't yet. Will read // mphi values if it wasn't tracked. - LocIdx SrcL = MTracker->lookupOrTrackRegister(SrcSubReg); - LocIdx DstL = MTracker->lookupOrTrackRegister(DstSubReg); + LocIdx SrcL = + MTracker->lookupOrTrackRegister(MTracker->getLocID(SrcSubReg)); + LocIdx DstL = + MTracker->lookupOrTrackRegister(MTracker->getLocID(DstSubReg)); (void)SrcL; (void)DstL; ValueIDNum CpyValue = MTracker->readReg(SrcSubReg); @@ -2130,7 +2134,7 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { // Then, transfer subreg bits. for (MCPhysReg SR : TRI->subregs(Reg)) { // Ensure this reg is tracked, - (void)MTracker->lookupOrTrackRegister(SR); + (void)MTracker->lookupOrTrackRegister(MTracker->getLocID(SR)); unsigned SubregIdx = TRI->getSubRegIndex(Reg, SR); unsigned SpillID = MTracker->getLocID(Loc, SubregIdx); DoTransfer(SR, SpillID); @@ -2662,7 +2666,7 @@ void InstrRefBasedLDV::placeMLocPHIs( // For reg units, place PHIs, and then place them for any aliasing registers. for (Register R : RegUnitsToPHIUp) { - LocIdx L = MTracker->lookupOrTrackRegister(R); + LocIdx L = MTracker->lookupOrTrackRegister(MTracker->getLocID(R)); CollectPHIsForLoc(L); // Install those PHI values into the live-in value array. 
@@ -2675,7 +2679,8 @@ void InstrRefBasedLDV::placeMLocPHIs( if (!MTracker->isRegisterTracked(*RAI)) continue; - LocIdx AliasLoc = MTracker->lookupOrTrackRegister(*RAI); + LocIdx AliasLoc = + MTracker->lookupOrTrackRegister(MTracker->getLocID(*RAI)); InstallPHIsAtLoc(AliasLoc); } } diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 235a53dcc156e..5211a6c8ef416 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -955,7 +955,7 @@ TEST_F(InstrRefLDVTest, MLocSingleBlock) { // Add a new register to be tracked, and insert it into the transfer function // as a copy. The output of $rax should be the live-in value of $rsp. Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); TransferFunc[0].insert({RspLoc, ValueIDNum(0, 1, RspLoc)}); TransferFunc[0].insert({RaxLoc, ValueIDNum(0, 0, RspLoc)}); initValueArray(MInLocs, 1, 2); @@ -980,7 +980,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondBlocks) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(4, 2); @@ -1194,7 +1194,7 @@ TEST_F(InstrRefLDVTest, MLocSimpleLoop) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(3, 2); @@ -1292,7 +1292,7 @@ TEST_F(InstrRefLDVTest, MLocNestedLoop) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = 
MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(5, 2); @@ -1493,7 +1493,7 @@ TEST_F(InstrRefLDVTest, MLocNoDominatingLoop) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(5, 2); @@ -1648,7 +1648,7 @@ TEST_F(InstrRefLDVTest, MLocBadlyNestedLoops) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(5, 2); @@ -1780,7 +1780,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocDiamond) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(4, 2); @@ -1976,7 +1976,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocLoops) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); auto [MInLocs, MOutLocs] = allocValueTables(3, 2); @@ -2104,9 +2104,9 @@ TEST_F(InstrRefLDVTest, pickVPHILocBadlyNestedLoops) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); Register RBX = getRegByName("RBX"); - LocIdx RbxLoc = MTracker->lookupOrTrackRegister(RBX); + LocIdx RbxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RBX)); auto [MInLocs, MOutLocs] = 
allocValueTables(5, 3); @@ -2256,7 +2256,7 @@ TEST_F(InstrRefLDVTest, vlocJoinDiamond) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - MTracker->lookupOrTrackRegister(RAX); + MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); DbgOpID LiveInRspID = DbgOpID(false, 0); DbgOpID LiveInRaxID = DbgOpID(false, 1); @@ -2440,7 +2440,7 @@ TEST_F(InstrRefLDVTest, vlocJoinLoops) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - MTracker->lookupOrTrackRegister(RAX); + MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); DbgOpID LiveInRspID = DbgOpID(false, 0); DbgOpID LiveInRaxID = DbgOpID(false, 1); @@ -2538,9 +2538,9 @@ TEST_F(InstrRefLDVTest, vlocJoinBadlyNestedLoops) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - MTracker->lookupOrTrackRegister(RAX); + MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); Register RBX = getRegByName("RBX"); - MTracker->lookupOrTrackRegister(RBX); + MTracker->lookupOrTrackRegister(MTracker->getLocID(RBX)); DbgOpID LiveInRspID = DbgOpID(false, 0); DbgOpID LiveInRaxID = DbgOpID(false, 1); @@ -2678,7 +2678,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); unsigned EntryBlk = 0, RetBlk = 3; @@ -2896,7 +2896,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); unsigned EntryBlk = 0, LoopBlk = 1; @@ -3175,7 +3175,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); Register RAX = 
getRegByName("RAX"); - LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); + LocIdx RaxLoc = MTracker->lookupOrTrackRegister(MTracker->getLocID(RAX)); unsigned EntryBlk = 0, Loop1Blk = 1, Loop2Blk = 2; From d6703bbe18536a747f1d25f6910cad44dd2db652 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Thu, 13 Nov 2025 11:41:31 -0500 Subject: [PATCH 11/25] [GISel][AArch64] Create emitCMP instead of cloning a virtual register (NFC) (#155262) CMN also has a function like this, we should do the same with CMP. --- .../GISel/AArch64InstructionSelector.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 394024693194c..64db3765c433f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -310,6 +310,8 @@ class AArch64InstructionSelector : public InstructionSelector { MachineIRBuilder &MIRBuilder) const; MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; + MachineInstr *emitCMP(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const; MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const; MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS, @@ -4412,6 +4414,15 @@ AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS, return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder); } +MachineInstr * +AArch64InstructionSelector::emitCMP(MachineOperand &LHS, MachineOperand &RHS, + MachineIRBuilder &MIRBuilder) const { + MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo(); + bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32; + auto RC = Is32Bit ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass; + return emitSUBS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder); +} + MachineInstr * AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const { @@ -4464,8 +4475,7 @@ MachineInstr *AArch64InstructionSelector::emitIntegerCompare( // Fold the compare into a cmn or tst if possible. if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder)) return FoldCmp; - auto Dst = MRI.cloneVirtualRegister(LHS.getReg()); - return emitSUBS(Dst, LHS, RHS, MIRBuilder); + return emitCMP(LHS, RHS, MIRBuilder); } MachineInstr *AArch64InstructionSelector::emitCSetForFCmp( @@ -4870,9 +4880,8 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec( // Produce a normal comparison if we are first in the chain if (!CCOp) { - auto Dst = MRI.cloneVirtualRegister(LHS); if (isa(Cmp)) - return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB); + return emitCMP(Cmp->getOperand(2), Cmp->getOperand(3), MIB); return emitFPCompare(Cmp->getOperand(2).getReg(), Cmp->getOperand(3).getReg(), MIB); } From 89c08ad2e27319688fbc5121b6f78bf1171e109e Mon Sep 17 00:00:00 2001 From: David Peixotto Date: Thu, 13 Nov 2025 08:42:34 -0800 Subject: [PATCH 12/25] [lldb] Add a gtest matcher for lldb_private::Value (#167427) This commit adds a new `ValueMatcher` class that can be used in gtest matching contexts to match against `lldb_private::Value` objects. We always match against the values `value_type` and `context_type`. For HostAddress values we will also match against the expected host buffer contents. For Scalar, FileAddress, and LoadAddress values we match against an expected Scalar value. The matcher is used to improve the quality of the tests in the `DwarfExpressionTest.cpp` file. 
Previously, the local `Evaluate` function would return an `Expected` value which makes it hard to verify that we actually get a Value of the expected type without adding custom evaluation code. Now we return an `Expected` so that we can match against the full value contents. The resulting change improves the quality of the existing checks and in some cases eliminates the need for special code to explicitly check value types. I followed the gtest [guide](https://google.github.io/googletest/gmock_cook_book.html#writing-new-monomorphic-matchers) for writing a new value matcher. --- lldb/unittests/Expression/CMakeLists.txt | 1 + .../Expression/DWARFExpressionTest.cpp | 202 +++++------------ lldb/unittests/Expression/ValueMatcher.cpp | 205 ++++++++++++++++++ lldb/unittests/Expression/ValueMatcher.h | 155 +++++++++++++ 4 files changed, 421 insertions(+), 142 deletions(-) create mode 100644 lldb/unittests/Expression/ValueMatcher.cpp create mode 100644 lldb/unittests/Expression/ValueMatcher.h diff --git a/lldb/unittests/Expression/CMakeLists.txt b/lldb/unittests/Expression/CMakeLists.txt index 2600557b6b376..0e0b002500eb4 100644 --- a/lldb/unittests/Expression/CMakeLists.txt +++ b/lldb/unittests/Expression/CMakeLists.txt @@ -10,6 +10,7 @@ add_lldb_unittest(ExpressionTests DWARFExpressionTest.cpp CppModuleConfigurationTest.cpp ExpressionTest.cpp + ValueMatcher.cpp LINK_COMPONENTS Support diff --git a/lldb/unittests/Expression/DWARFExpressionTest.cpp b/lldb/unittests/Expression/DWARFExpressionTest.cpp index 9d11060becfae..8c5568d9e4e65 100644 --- a/lldb/unittests/Expression/DWARFExpressionTest.cpp +++ b/lldb/unittests/Expression/DWARFExpressionTest.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - #include "lldb/Expression/DWARFExpression.h" +#include "ValueMatcher.h" #ifdef ARCH_AARCH64 #include "Plugins/ABI/AArch64/ABISysV_arm64.h" #endif @@ -135,40 +135,18 @@ 
class MockRegisterContext : public RegisterContext { }; } // namespace -static llvm::Expected Evaluate(llvm::ArrayRef expr, - lldb::ModuleSP module_sp = {}, - DWARFUnit *unit = nullptr, - ExecutionContext *exe_ctx = nullptr, - RegisterContext *reg_ctx = nullptr) { +static llvm::Expected Evaluate(llvm::ArrayRef expr, + lldb::ModuleSP module_sp = {}, + DWARFUnit *unit = nullptr, + ExecutionContext *exe_ctx = nullptr, + RegisterContext *reg_ctx = nullptr) { DataExtractor extractor(expr.data(), expr.size(), lldb::eByteOrderLittle, /*addr_size*/ 4); - llvm::Expected result = DWARFExpression::Evaluate( - exe_ctx, reg_ctx, module_sp, extractor, unit, lldb::eRegisterKindLLDB, - /*initial_value_ptr=*/nullptr, - /*object_address_ptr=*/nullptr); - if (!result) - return result.takeError(); - - switch (result->GetValueType()) { - case Value::ValueType::Scalar: - return result->GetScalar(); - case Value::ValueType::LoadAddress: - return LLDB_INVALID_ADDRESS; - case Value::ValueType::HostAddress: { - // Convert small buffers to scalars to simplify the tests. - DataBufferHeap &buf = result->GetBuffer(); - if (buf.GetByteSize() <= 8) { - uint64_t val = 0; - memcpy(&val, buf.GetBytes(), buf.GetByteSize()); - return Scalar(llvm::APInt(buf.GetByteSize() * 8, val, false)); - } - } - [[fallthrough]]; - default: - break; - } - return llvm::createStringError("unsupported value type"); + return DWARFExpression::Evaluate(exe_ctx, reg_ctx, module_sp, extractor, unit, + lldb::eRegisterKindLLDB, + /*initial_value_ptr=*/nullptr, + /*object_address_ptr=*/nullptr); } class DWARFExpressionTester : public YAMLModuleTester { @@ -177,18 +155,11 @@ class DWARFExpressionTester : public YAMLModuleTester { : YAMLModuleTester(yaml_data, cu_index) {} using YAMLModuleTester::YAMLModuleTester; - llvm::Expected Eval(llvm::ArrayRef expr) { + llvm::Expected Eval(llvm::ArrayRef expr) { return ::Evaluate(expr, m_module_sp, m_dwarf_unit); } }; -/// Unfortunately Scalar's operator==() is really picky. 
-static Scalar GetScalar(unsigned bits, uint64_t value, bool sign) { - Scalar scalar(value); - scalar.TruncOrExtendTo(bits, sign); - return scalar; -} - /// This is needed for the tests that use a mock process. class DWARFExpressionMockProcessTest : public ::testing::Test { public: @@ -255,48 +226,48 @@ class MockTarget : public Target { TEST(DWARFExpression, DW_OP_pick) { EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit1, DW_OP_lit0, DW_OP_pick, 0}), - llvm::HasValue(0)); + ExpectScalar(0)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit1, DW_OP_lit0, DW_OP_pick, 1}), - llvm::HasValue(1)); + ExpectScalar(1)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit1, DW_OP_lit0, DW_OP_pick, 2}), llvm::Failed()); } TEST(DWARFExpression, DW_OP_const) { // Extend to address size. - EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const1u, 0x88}), llvm::HasValue(0x88)); + EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const1u, 0x88}), ExpectScalar(0x88)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const1s, 0x88}), - llvm::HasValue(0xffffff88)); + ExpectScalar(0xffffff88)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const2u, 0x47, 0x88}), - llvm::HasValue(0x8847)); + ExpectScalar(0x8847)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const2s, 0x47, 0x88}), - llvm::HasValue(0xffff8847)); + ExpectScalar(0xffff8847)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const4u, 0x44, 0x42, 0x47, 0x88}), - llvm::HasValue(0x88474244)); + ExpectScalar(0x88474244)); EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const4s, 0x44, 0x42, 0x47, 0x88}), - llvm::HasValue(0x88474244)); + ExpectScalar(0x88474244)); // Truncate to address size. EXPECT_THAT_EXPECTED( Evaluate({DW_OP_const8u, 0x00, 0x11, 0x22, 0x33, 0x44, 0x42, 0x47, 0x88}), - llvm::HasValue(0x33221100)); + ExpectScalar(0x33221100)); EXPECT_THAT_EXPECTED( Evaluate({DW_OP_const8s, 0x00, 0x11, 0x22, 0x33, 0x44, 0x42, 0x47, 0x88}), - llvm::HasValue(0x33221100)); + ExpectScalar(0x33221100)); // Don't truncate to address size for compatibility with clang (pr48087). 
EXPECT_THAT_EXPECTED( Evaluate({DW_OP_constu, 0x81, 0x82, 0x84, 0x88, 0x90, 0xa0, 0x40}), - llvm::HasValue(0x01010101010101)); + ExpectScalar(0x01010101010101)); EXPECT_THAT_EXPECTED( Evaluate({DW_OP_consts, 0x81, 0x82, 0x84, 0x88, 0x90, 0xa0, 0x40}), - llvm::HasValue(0xffff010101010101)); + ExpectScalar(0xffff010101010101)); } TEST(DWARFExpression, DW_OP_skip) { EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const1u, 0x42, DW_OP_skip, 0x02, 0x00, DW_OP_const1u, 0xff}), - llvm::HasValue(0x42)); + ExpectScalar(0x42)); } TEST(DWARFExpression, DW_OP_bra) { @@ -309,7 +280,7 @@ TEST(DWARFExpression, DW_OP_bra) { DW_OP_const1u, 0xff, // push 0xff }), // clang-format on - llvm::HasValue(0x42)); + ExpectScalar(0x42)); EXPECT_THAT_ERROR(Evaluate({DW_OP_bra, 0x01, 0x00}).takeError(), llvm::Failed()); @@ -414,42 +385,42 @@ TEST(DWARFExpression, DW_OP_convert) { EXPECT_THAT_EXPECTED( t.Eval({DW_OP_const4u, 0x11, 0x22, 0x33, 0x44, // DW_OP_convert, offs_uint32_t, DW_OP_stack_value}), - llvm::HasValue(GetScalar(64, 0x44332211, not_signed))); + ExpectScalar(64, 0x44332211, not_signed)); // Zero-extend to 64 bits. EXPECT_THAT_EXPECTED( t.Eval({DW_OP_const4u, 0x11, 0x22, 0x33, 0x44, // DW_OP_convert, offs_uint64_t, DW_OP_stack_value}), - llvm::HasValue(GetScalar(64, 0x44332211, not_signed))); + ExpectScalar(64, 0x44332211, not_signed)); // Sign-extend to 64 bits. EXPECT_THAT_EXPECTED( t.Eval({DW_OP_const4s, 0xcc, 0xdd, 0xee, 0xff, // DW_OP_convert, offs_sint64_t, DW_OP_stack_value}), - llvm::HasValue(GetScalar(64, 0xffffffffffeeddcc, is_signed))); + ExpectScalar(64, 0xffffffffffeeddcc, is_signed)); // Sign-extend, then truncate. EXPECT_THAT_EXPECTED( t.Eval({DW_OP_const4s, 0xcc, 0xdd, 0xee, 0xff, // DW_OP_convert, offs_sint64_t, // DW_OP_convert, offs_uint32_t, DW_OP_stack_value}), - llvm::HasValue(GetScalar(32, 0xffeeddcc, not_signed))); + ExpectScalar(32, 0xffeeddcc, not_signed)); // Truncate to default unspecified (pointer-sized) type. 
EXPECT_THAT_EXPECTED(t.Eval({DW_OP_const4s, 0xcc, 0xdd, 0xee, 0xff, // DW_OP_convert, offs_sint64_t, // DW_OP_convert, 0x00, DW_OP_stack_value}), - llvm::HasValue(GetScalar(32, 0xffeeddcc, not_signed))); + ExpectScalar(32, 0xffeeddcc, not_signed)); // Truncate to 8 bits. EXPECT_THAT_EXPECTED(t.Eval({DW_OP_const4s, 'A', 'B', 'C', 'D', DW_OP_convert, offs_uchar, DW_OP_stack_value}), - llvm::HasValue(GetScalar(8, 'A', not_signed))); + ExpectScalar(8, 'A', not_signed)); // Also truncate to 8 bits. EXPECT_THAT_EXPECTED(t.Eval({DW_OP_const4s, 'A', 'B', 'C', 'D', DW_OP_convert, offs_schar, DW_OP_stack_value}), - llvm::HasValue(GetScalar(8, 'A', is_signed))); + ExpectScalar(8, 'A', is_signed)); // // Errors. @@ -479,33 +450,21 @@ TEST(DWARFExpression, DW_OP_stack_value) { TEST(DWARFExpression, DW_OP_piece) { EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const2u, 0x11, 0x22, DW_OP_piece, 2, DW_OP_const2u, 0x33, 0x44, DW_OP_piece, 2}), - llvm::HasValue(GetScalar(32, 0x44332211, true))); + ExpectHostAddress({0x11, 0x22, 0x33, 0x44})); EXPECT_THAT_EXPECTED( Evaluate({DW_OP_piece, 1, DW_OP_const1u, 0xff, DW_OP_piece, 1}), // Note that the "00" should really be "undef", but we can't // represent that yet. - llvm::HasValue(GetScalar(16, 0xff00, true))); -} - -TEST(DWARFExpression, DW_OP_piece_host_address) { - static const uint8_t expr_data[] = {DW_OP_lit2, DW_OP_stack_value, - DW_OP_piece, 40}; - llvm::ArrayRef expr(expr_data, sizeof(expr_data)); - DataExtractor extractor(expr.data(), expr.size(), lldb::eByteOrderLittle, 4); + ExpectHostAddress({0x00, 0xff})); // This tests if ap_int is extended to the right width. // expect 40*8 = 320 bits size. 
- llvm::Expected result = - DWARFExpression::Evaluate(nullptr, nullptr, nullptr, extractor, nullptr, - lldb::eRegisterKindDWARF, nullptr, nullptr); - ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); - ASSERT_EQ(result->GetValueType(), Value::ValueType::HostAddress); - ASSERT_EQ(result->GetBuffer().GetByteSize(), 40ul); - const uint8_t *data = result->GetBuffer().GetBytes(); - ASSERT_EQ(data[0], 2); - for (int i = 1; i < 40; i++) { - ASSERT_EQ(data[i], 0); - } + std::vector expected_host_buffer(40, 0); + expected_host_buffer[0] = 2; + + EXPECT_THAT_EXPECTED( + Evaluate({{DW_OP_lit2, DW_OP_stack_value, DW_OP_piece, 40}}), + ExpectHostAddress(expected_host_buffer)); } TEST(DWARFExpression, DW_OP_implicit_value) { @@ -513,7 +472,7 @@ TEST(DWARFExpression, DW_OP_implicit_value) { EXPECT_THAT_EXPECTED( Evaluate({DW_OP_implicit_value, bytes, 0x11, 0x22, 0x33, 0x44}), - llvm::HasValue(GetScalar(8 * bytes, 0x44332211, true))); + ExpectHostAddress({0x11, 0x22, 0x33, 0x44})); } TEST(DWARFExpression, DW_OP_unknown) { @@ -548,20 +507,13 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_deref) { // Implicit location: *0x4. EXPECT_THAT_EXPECTED( Evaluate({DW_OP_lit4, DW_OP_deref, DW_OP_stack_value}, {}, {}, &exe_ctx), - llvm::HasValue(GetScalar(32, 0x07060504, false))); + ExpectScalar(32, 0x07060504, false)); // Memory location: *(*0x4). - // Evaluate returns LLDB_INVALID_ADDRESS for all load addresses. EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit4, DW_OP_deref}, {}, {}, &exe_ctx), - llvm::HasValue(Scalar(LLDB_INVALID_ADDRESS))); + ExpectLoadAddress(0x07060504)); // Memory location: *0x4. - // Evaluate returns LLDB_INVALID_ADDRESS for all load addresses. EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit4}, {}, {}, &exe_ctx), - llvm::HasValue(Scalar(4))); - // Implicit location: *0x4. - // Evaluate returns LLDB_INVALID_ADDRESS for all load addresses. 
- EXPECT_THAT_EXPECTED( - Evaluate({DW_OP_lit4, DW_OP_deref, DW_OP_stack_value}, {}, {}, &exe_ctx), - llvm::HasValue(GetScalar(32, 0x07060504, false))); + ExpectScalar(Scalar(4))); } TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr) { @@ -581,18 +533,9 @@ TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr) { ExecutionContext exe_ctx(target_sp, false); // DW_OP_addr takes a single operand of address size width: - uint8_t expr[] = {DW_OP_addr, 0x40, 0x0, 0x0, 0x0}; - DataExtractor extractor(expr, sizeof(expr), lldb::eByteOrderLittle, - /*addr_size*/ 4); - - llvm::Expected result = DWARFExpression::Evaluate( - &exe_ctx, /*reg_ctx*/ nullptr, /*module_sp*/ {}, extractor, - /*unit*/ nullptr, lldb::eRegisterKindLLDB, - /*initial_value_ptr*/ nullptr, - /*object_address_ptr*/ nullptr); - - ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); - ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); + EXPECT_THAT_EXPECTED( + Evaluate({DW_OP_addr, 0x40, 0x0, 0x0, 0x0}, {}, {}, &exe_ctx), + ExpectLoadAddress(0x40)); } TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr_index) { @@ -676,15 +619,11 @@ TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr_index) { DWARFExpression expr(extractor); llvm::Expected result = evaluate(expr); - ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); - ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); - ASSERT_EQ(result->GetScalar().UInt(), 0x5678u); + EXPECT_THAT_EXPECTED(result, ExpectLoadAddress(0x5678u)); ASSERT_TRUE(expr.Update_DW_OP_addr(dwarf_cu, 0xdeadbeef)); result = evaluate(expr); - ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); - ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); - ASSERT_EQ(result->GetScalar().UInt(), 0xdeadbeefu); + EXPECT_THAT_EXPECTED(result, ExpectLoadAddress(0xdeadbeefu)); } class CustomSymbolFileDWARF : public SymbolFileDWARF { @@ -778,11 +717,12 @@ static auto testExpressionVendorExtensions(lldb::ModuleSP module_sp, RegisterContext *reg_ctx) { // Test that 
expression extensions can be evaluated, for example // DW_OP_WASM_location which is not currently handled by DWARFExpression: - EXPECT_THAT_EXPECTED(Evaluate({DW_OP_WASM_location, 0x03, // WASM_GLOBAL:0x03 - 0x04, 0x00, 0x00, // index:u32 - 0x00, DW_OP_stack_value}, - module_sp, &dwarf_unit, nullptr, reg_ctx), - llvm::HasValue(GetScalar(32, 42, false))); + EXPECT_THAT_EXPECTED( + Evaluate({DW_OP_WASM_location, 0x03, // WASM_GLOBAL:0x03 + 0x04, 0x00, 0x00, // index:u32 + 0x00, DW_OP_stack_value}, + module_sp, &dwarf_unit, nullptr, reg_ctx), + ExpectScalar(32, 42, false, Value::ContextType::RegisterInfo)); // Test that searches for opcodes work in the presence of extensions: uint8_t expr[] = {DW_OP_WASM_location, 0x03, 0x04, 0x00, 0x00, 0x00, @@ -1148,17 +1088,8 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_piece_file_addr) { uint8_t expr[] = {DW_OP_addr, 0x40, 0x0, 0x0, 0x0, DW_OP_piece, 1, DW_OP_addr, 0x50, 0x0, 0x0, 0x0, DW_OP_piece, 1}; - DataExtractor extractor(expr, sizeof(expr), lldb::eByteOrderLittle, - /*addr_size=*/4); - llvm::Expected result = DWARFExpression::Evaluate( - &exe_ctx, /*reg_ctx=*/nullptr, /*module_sp=*/{}, extractor, - /*unit=*/nullptr, lldb::eRegisterKindLLDB, - /*initial_value_ptr=*/nullptr, - /*object_address_ptr=*/nullptr); - - ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); - ASSERT_EQ(result->GetValueType(), Value::ValueType::HostAddress); - ASSERT_THAT(result->GetBuffer().GetData(), ElementsAre(0x11, 0x22)); + EXPECT_THAT_EXPECTED(Evaluate(expr, {}, {}, &exe_ctx), + ExpectHostAddress({0x11, 0x22})); } /// A Process whose `ReadMemory` override queries a DenseMap. 
@@ -1228,28 +1159,15 @@ TEST_F(DWARFExpressionMockProcessTestWithAArch, DW_op_deref_no_ptr_fixing) { process_sp->GetThreadList().AddThread(thread); auto evaluate_expr = [&](auto &expr_data) { - DataExtractor extractor(expr_data, sizeof(expr_data), - lldb::eByteOrderLittle, - /*addr_size*/ 8); - DWARFExpression expr(extractor); - ExecutionContext exe_ctx(process_sp); - llvm::Expected result = DWARFExpression::Evaluate( - &exe_ctx, reg_ctx_sp.get(), /*module_sp*/ nullptr, extractor, - /*unit*/ nullptr, lldb::eRegisterKindLLDB, - /*initial_value_ptr=*/nullptr, - /*object_address_ptr=*/nullptr); - return result; + return Evaluate(expr_data, {}, {}, &exe_ctx, reg_ctx_sp.get()); }; uint8_t expr_reg[] = {DW_OP_breg22, 0}; llvm::Expected result_reg = evaluate_expr(expr_reg); - ASSERT_THAT_EXPECTED(result_reg, llvm::Succeeded()); - ASSERT_EQ(result_reg->GetValueType(), Value::ValueType::LoadAddress); - ASSERT_EQ(result_reg->GetScalar().ULongLong(), addr); + EXPECT_THAT_EXPECTED(result_reg, ExpectLoadAddress(addr)); uint8_t expr_deref[] = {DW_OP_breg22, 0, DW_OP_deref}; llvm::Expected result_deref = evaluate_expr(expr_deref); - ASSERT_THAT_EXPECTED(result_deref, llvm::Succeeded()); - ASSERT_EQ(result_deref->GetScalar().ULongLong(), expected_value); + EXPECT_THAT_EXPECTED(result_deref, ExpectLoadAddress(expected_value)); } diff --git a/lldb/unittests/Expression/ValueMatcher.cpp b/lldb/unittests/Expression/ValueMatcher.cpp new file mode 100644 index 0000000000000..ee7ccaebabd64 --- /dev/null +++ b/lldb/unittests/Expression/ValueMatcher.cpp @@ -0,0 +1,205 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ValueMatcher.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/InterleavedRange.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace lldb_private; + +static void FormatValueDetails(llvm::raw_ostream &os, + Value::ValueType value_type, + Value::ContextType context_type, + const Scalar &scalar, + llvm::ArrayRef buffer_data) { + os << "Value("; + os << "value_type=" << Value::GetValueTypeAsCString(value_type); + os << ", context_type=" << Value::GetContextTypeAsCString(context_type); + + if (value_type == Value::ValueType::HostAddress) { + auto bytes_to_print = buffer_data.take_front(16); + os << ", buffer=["; + llvm::interleave( + bytes_to_print, + [&](uint8_t byte) { + os << llvm::format("%02x", static_cast(byte)); + }, + [&]() { os << " "; }); + if (buffer_data.size() > 16) + os << " ..."; + os << "] (" << buffer_data.size() << " bytes)"; + } else { + os << ", value=" << scalar; + } + os << ")"; +} + +void lldb_private::PrintTo(const Value &val, std::ostream *os) { + if (!os) + return; + + llvm::raw_os_ostream raw_os(*os); + FormatValueDetails(raw_os, val.GetValueType(), val.GetContextType(), + val.GetScalar(), val.GetBuffer().GetData()); +} + +bool ValueMatcher::MatchAndExplain(const Value &val, + std::ostream *stream) const { + if (stream) { + llvm::raw_os_ostream os(*stream); + return MatchAndExplainImpl(val, os); + } + + llvm::raw_null_ostream os; + return MatchAndExplainImpl(val, os); +} + +// Match the provided value and explain any mismatches using +// the raw_ostream. We use the llvm::raw_ostream here to simplify the formatting +// of Scalar values which already know how to print themselves to that stream. 
+bool ValueMatcher::MatchAndExplainImpl(const Value &val, + llvm::raw_ostream &os) const { + if (val.GetValueType() != m_value_type) { + os << "value_type mismatch: expected " + << Value::GetValueTypeAsCString(m_value_type) << ", got " + << Value::GetValueTypeAsCString(val.GetValueType()) << " "; + return false; + } + + if (val.GetContextType() != m_context_type) { + os << "context_type mismatch: expected " + << Value::GetContextTypeAsCString(m_context_type) << ", got " + << Value::GetContextTypeAsCString(val.GetContextType()) << " "; + return false; + } + + if (m_value_type == Value::ValueType::HostAddress) { + const DataBufferHeap &buffer = val.GetBuffer(); + const size_t buffer_size = buffer.GetByteSize(); + if (buffer_size != m_expected_bytes.size()) { + os << "buffer size mismatch: expected " << m_expected_bytes.size() + << ", got " << buffer_size << " "; + return false; + } + + const uint8_t *data = buffer.GetBytes(); + for (size_t i = 0; i < buffer_size; ++i) { + if (data[i] != m_expected_bytes[i]) { + os << "byte mismatch at index " << i << ": expected " + << llvm::format("0x%02x", static_cast(m_expected_bytes[i])) + << ", got " << llvm::format("0x%02x", static_cast(data[i])) + << " "; + return false; + } + } + } else { + // For Scalar, FileAddress, and LoadAddress compare m_value. 
+ const Scalar &actual_scalar = val.GetScalar(); + if (actual_scalar != m_expected_scalar) { + os << "scalar value mismatch: expected " << m_expected_scalar << ", got " + << actual_scalar; + return false; + } + } + + return true; +} + +void ValueMatcher::DescribeTo(std::ostream *os) const { + if (!os) + return; + llvm::raw_os_ostream raw_os(*os); + FormatValueDetails(raw_os, m_value_type, m_context_type, m_expected_scalar, + m_expected_bytes); +} + +void ValueMatcher::DescribeNegationTo(std::ostream *os) const { + if (!os) + return; + *os << "value does not match"; +} + +testing::Matcher +lldb_private::MatchScalarValue(Value::ValueType value_type, + const Scalar &expected_scalar, + Value::ContextType context_type) { + return ValueMatcher(value_type, expected_scalar, context_type); +} + +testing::Matcher +lldb_private::MatchHostValue(Value::ValueType value_type, + const std::vector &expected_bytes, + Value::ContextType context_type) { + return ValueMatcher(value_type, expected_bytes, context_type); +} + +testing::Matcher +lldb_private::IsScalar(const Scalar &expected_scalar, + Value::ContextType context_type) { + return MatchScalarValue(Value::ValueType::Scalar, expected_scalar, + context_type); +} + +testing::Matcher +lldb_private::IsLoadAddress(const Scalar &expected_address, + Value::ContextType context_type) { + return MatchScalarValue(Value::ValueType::LoadAddress, expected_address, + context_type); +} + +testing::Matcher +lldb_private::IsFileAddress(const Scalar &expected_address, + Value::ContextType context_type) { + return MatchScalarValue(Value::ValueType::FileAddress, expected_address, + context_type); +} + +testing::Matcher +lldb_private::IsHostValue(const std::vector &expected_bytes, + Value::ContextType context_type) { + return MatchHostValue(Value::ValueType::HostAddress, expected_bytes, + context_type); +} + +Scalar lldb_private::GetScalar(unsigned bits, uint64_t value, bool sign) { + Scalar scalar(value); + scalar.TruncOrExtendTo(bits, sign); + 
return scalar; +} + +llvm::detail::ValueMatchesPoly> +lldb_private::ExpectScalar(const Scalar &expected_scalar, + Value::ContextType context_type) { + return llvm::HasValue(IsScalar(expected_scalar, context_type)); +} + +llvm::detail::ValueMatchesPoly> +lldb_private::ExpectScalar(unsigned bits, uint64_t value, bool sign, + Value::ContextType context_type) { + return ExpectScalar(GetScalar(bits, value, sign), context_type); +} + +llvm::detail::ValueMatchesPoly> +lldb_private::ExpectLoadAddress(const Scalar &expected_address, + Value::ContextType context_type) { + return llvm::HasValue(IsLoadAddress(expected_address, context_type)); +} + +llvm::detail::ValueMatchesPoly> +lldb_private::ExpectFileAddress(const Scalar &expected_address, + Value::ContextType context_type) { + return llvm::HasValue(IsFileAddress(expected_address, context_type)); +} + +llvm::detail::ValueMatchesPoly> +lldb_private::ExpectHostAddress(const std::vector &expected_bytes, + Value::ContextType context_type) { + return llvm::HasValue(IsHostValue(expected_bytes, context_type)); +} diff --git a/lldb/unittests/Expression/ValueMatcher.h b/lldb/unittests/Expression/ValueMatcher.h new file mode 100644 index 0000000000000..3ca7b15e1d3c8 --- /dev/null +++ b/lldb/unittests/Expression/ValueMatcher.h @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains the definition of the ValueMatcher class which is a used +/// to match lldb_private::Value in gtest assert/expect macros. It also contains +/// several helper functions to create matchers for common Value types. 
+/// +/// The ValueMatcher class was created using the gtest guide found here: +// https://google.github.io/googletest/gmock_cook_book.html#writing-new-monomorphic-matchers +//===----------------------------------------------------------------------===// + +#ifndef LLDB_UNITTESTS_EXPRESSION_VALUEMATCHER_H +#define LLDB_UNITTESTS_EXPRESSION_VALUEMATCHER_H + +#include "lldb/Core/Value.h" +#include "lldb/Utility/Scalar.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include +#include + +namespace lldb_private { + +/// Custom printer for Value objects to make test failures more readable. +void PrintTo(const Value &val, std::ostream *os); + +/// Custom matcher for Value. +/// +/// It matches against an expected value_type, and context_type. +/// For HostAddress value types it will match the expected contents of +/// the host buffer. For other value types it matches against an expected +/// scalar value. +class ValueMatcher { +public: + ValueMatcher(Value::ValueType value_type, const Scalar &expected_scalar, + Value::ContextType context_type) + : m_value_type(value_type), m_context_type(context_type), + m_expected_scalar(expected_scalar) { + assert(value_type == Value::ValueType::Scalar || + value_type == Value::ValueType::FileAddress || + value_type == Value::ValueType::LoadAddress); + } + + ValueMatcher(Value::ValueType value_type, + const std::vector &expected_bytes, + Value::ContextType context_type) + : m_value_type(value_type), m_context_type(context_type), + m_expected_bytes(expected_bytes) { + assert(value_type == Value::ValueType::HostAddress); + } + + // Typedef to hook into the gtest matcher machinery. 
+ using is_gtest_matcher = void; + + bool MatchAndExplain(const Value &val, std::ostream *os) const; + + void DescribeTo(std::ostream *os) const; + + void DescribeNegationTo(std::ostream *os) const; + +private: + Value::ValueType m_value_type = Value::ValueType::Invalid; + Value::ContextType m_context_type = Value::ContextType::Invalid; + Scalar m_expected_scalar; + std::vector m_expected_bytes; + + bool MatchAndExplainImpl(const Value &val, llvm::raw_ostream &os) const; +}; + +/// Matcher for Value with Scalar, FileAddress, or LoadAddress types. +/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(MatchScalarValue(...))); +testing::Matcher MatchScalarValue(Value::ValueType value_type, + const Scalar &expected_scalar, + Value::ContextType context_type); + +/// Matcher for Value with HostAddress type. +/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(MatchHostValue(...))); +testing::Matcher +MatchHostValue(Value::ValueType value_type, + const std::vector &expected_bytes, + Value::ContextType context_type); + +/// Helper to match a Scalar value and context type. +/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(IsScalar(42))); +testing::Matcher IsScalar(const Scalar &expected_scalar, + Value::ContextType context_type); + +/// Helper to match a LoadAddress value and context type. +/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(IsLoadAddress(0x1000))); +testing::Matcher IsLoadAddress(const Scalar &expected_address, + Value::ContextType context_type); + +/// Helper to match a FileAddress value and context type. 
+/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(IsFileAddress(Scalar(0x1000)))); +testing::Matcher IsFileAddress(const Scalar &expected_address, + Value::ContextType context_type); + +/// Helper to match a HostAddress value and context type. +/// Use with llvm::HasValue() to match Expected: +/// EXPECT_THAT_EXPECTED(result, llvm::HasValue(IsHostValue({0x11, 0x22}))); +testing::Matcher IsHostValue(const std::vector &expected_bytes, + Value::ContextType context_type); + +/// Helper to create a scalar because Scalar's operator==() is really picky. +Scalar GetScalar(unsigned bits, uint64_t value, bool sign); + +/// Helper that combines IsScalar with llvm::HasValue for Expected. +/// Use it on an Expected like this: +/// EXPECT_THAT_EXPECTED(result, ExpectScalar(42)); +llvm::detail::ValueMatchesPoly> +ExpectScalar(const Scalar &expected_scalar, + Value::ContextType context_type = Value::ContextType::Invalid); + +/// Helper that combines GetScalar with ExpectScalar to get a precise scalar. +/// Use it on an Expected like this: +/// EXPECT_THAT_EXPECTED(result, ExpectScalar(8, 42, true)); +llvm::detail::ValueMatchesPoly> +ExpectScalar(unsigned bits, uint64_t value, bool sign, + Value::ContextType context_type = Value::ContextType::Invalid); + +/// Helper that combines IsLoadAddress with llvm::HasValue for Expected. +/// Use it on an Expected like this: +/// EXPECT_THAT_EXPECTED(result, ExpectLoadAddress(0x1000)); +llvm::detail::ValueMatchesPoly> ExpectLoadAddress( + const Scalar &expected_address, + Value::ContextType context_type = Value::ContextType::Invalid); + +/// Helper that combines IsFileAddress with llvm::HasValue for Expected. 
+/// Use it on an Expected like this: +/// EXPECT_THAT_EXPECTED(result, ExpectFileAddress(Scalar(0x2000))); +llvm::detail::ValueMatchesPoly> ExpectFileAddress( + const Scalar &expected_address, + Value::ContextType context_type = Value::ContextType::Invalid); + +/// Helper that combines IsHostValue with llvm::HasValue for Expected. +/// Use it on an Expected like this: +/// EXPECT_THAT_EXPECTED(result, ExpectHostAddress({0x11, 0x22})); +llvm::detail::ValueMatchesPoly> ExpectHostAddress( + const std::vector &expected_bytes, + Value::ContextType context_type = Value::ContextType::Invalid); + +} // namespace lldb_private + +#endif // LLDB_UNITTESTS_EXPRESSION_VALUEMATCHER_H From ea16f7d9dbade0c138e4231aa12c92d91bc15b21 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 13 Nov 2025 08:43:10 -0800 Subject: [PATCH 13/25] [libcxx] Fix xsgetn in basic_filebuf (#167779) The optimized version of xsgetn for basic_filebuf added in #165223 has an issue where if the reads come from both the buffer and the filesystem it returns the wrong number of characters. This patch should address the issue. 
--- libcxx/include/fstream | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libcxx/include/fstream b/libcxx/include/fstream index b07ca636094af..90e35740c17cf 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -315,8 +315,14 @@ protected: traits_type::copy(__str, this->gptr(), __n); this->__gbump_ptrdiff(__n); } - if (__len - __n >= this->egptr() - this->eback()) - return std::fread(__str + __n, sizeof(char_type), __len - __n, __file_); + const streamsize __remainder = __len - __n; + const streamsize __buffer_space = this->egptr() - this->eback(); + + if (__remainder >= __buffer_space) + return std::fread(__str + __n, sizeof(char_type), __remainder, __file_) + __n; + else if (__remainder > 0) + return basic_streambuf<_CharT, _Traits>::xsgetn(__str + __n, __remainder) + __n; + return __n; } return basic_streambuf<_CharT, _Traits>::xsgetn(__str, __len); } From e63a47de91b566b3826604608ca80bac2a27c5a9 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Fri, 14 Nov 2025 00:48:27 +0800 Subject: [PATCH 14/25] [RISCV][llvm] Handle INSERT_VECTOR_ELT, EXTRACT_VECTOR_ELT codegen for zvfbfa (#167819) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 19 +- llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll | 263 +++++++ llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll | 700 +++++++++++++++---- llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll | 30 +- 4 files changed, 852 insertions(+), 160 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 28fe76bb35b0c..6306c6db37083 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1264,11 +1264,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); - setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, - ISD::INSERT_SUBVECTOR, 
ISD::EXTRACT_SUBVECTOR, - ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE, - ISD::VECTOR_COMPRESS}, + setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, + ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, + ISD::VECTOR_INTERLEAVE, ISD::VECTOR_REVERSE, + ISD::VECTOR_SPLICE, ISD::VECTOR_COMPRESS}, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); @@ -1278,9 +1278,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MVT EltVT = VT.getVectorElementType(); if (isTypeLegal(EltVT)) - setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT, - ISD::EXTRACT_VECTOR_ELT}, - VT, Custom); + setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, VT, + Custom); else setOperationAction({ISD::SPLAT_VECTOR, ISD::EXPERIMENTAL_VP_SPLAT}, EltVT, Custom); @@ -10356,7 +10355,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, } if ((ValVT == MVT::f16 && !Subtarget.hasVInstructionsF16()) || - ValVT == MVT::bf16) { + (ValVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) { // If we don't have vfmv.s.f for f16/bf16, use fmv.x.h first. 
MVT IntVT = VecVT.changeTypeToInteger(); SDValue IntInsert = DAG.getNode( @@ -10593,7 +10592,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, } if ((EltVT == MVT::f16 && !Subtarget.hasVInstructionsF16()) || - EltVT == MVT::bf16) { + (EltVT == MVT::bf16 && !Subtarget.hasVInstructionsBF16())) { // If we don't have vfmv.f.s for f16/bf16, extract to a gpr then use fmv.h.x MVT IntVT = VecVT.changeTypeToInteger(); SDValue IntVec = DAG.getBitcast(IntVT, Vec); diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll index 692a7ce0b20e8..903c0dcaba2d8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,NOZFMIN,ZVFHMIN ; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZFMIN ; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zfbfmin,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZFMIN +; RUN: llc -mtriple=riscv32 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+v,+d,+zfhmin,+zvfhmin,+experimental-zvfbfa -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,ZVFBFA define bfloat @extractelt_nxv1bf16_0( %v) { ; NOZFMIN-LABEL: extractelt_nxv1bf16_0: @@ -22,6 +24,12 @@ define bfloat @extractelt_nxv1bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -44,6 +52,13 @@ define bfloat 
@extractelt_nxv1bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -66,6 +81,13 @@ define bfloat @extractelt_nxv1bf16_idx( %v, i32 zeroext %id ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf4, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -86,6 +108,12 @@ define bfloat @extractelt_nxv2bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -108,6 +136,13 @@ define bfloat @extractelt_nxv2bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -130,6 +165,13 @@ define bfloat @extractelt_nxv2bf16_idx( %v, i32 zeroext %id ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -150,6 +192,12 @@ define bfloat 
@extractelt_nxv4bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -172,6 +220,13 @@ define bfloat @extractelt_nxv4bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -194,6 +249,13 @@ define bfloat @extractelt_nxv4bf16_idx( %v, i32 zeroext %id ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -214,6 +276,12 @@ define bfloat @extractelt_nxv8bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -236,6 +304,13 @@ define bfloat @extractelt_nxv8bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -258,6 +333,14 @@ define bfloat @extractelt_nxv8bf16_idx( %v, i32 zeroext %id ; ZFMIN-NEXT: 
vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m2, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -278,6 +361,12 @@ define bfloat @extractelt_nxv16bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv16bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -300,6 +389,13 @@ define bfloat @extractelt_nxv16bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv16bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -322,6 +418,14 @@ define bfloat @extractelt_nxv16bf16_idx( %v, i32 zeroext % ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv16bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m4, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -342,6 +446,12 @@ define bfloat @extractelt_nxv32bf16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret bfloat %r } @@ -364,6 +474,13 @@ define 
bfloat @extractelt_nxv32bf16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret bfloat %r } @@ -386,6 +503,14 @@ define bfloat @extractelt_nxv32bf16_idx( %v, i32 zeroext % ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vsetvli zero, zero, e16alt, m8, ta, ma +; ZVFBFA-NEXT: vfmv.f.s fa0, v8 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret bfloat %r } @@ -412,6 +537,13 @@ define half @extractelt_nxv1f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -441,6 +573,14 @@ define half @extractelt_nxv1f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -470,6 +610,14 @@ define half @extractelt_nxv1f16_idx( %v, i32 zeroext %idx) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv1f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; 
ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } @@ -496,6 +644,13 @@ define half @extractelt_nxv2f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -525,6 +680,14 @@ define half @extractelt_nxv2f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -554,6 +717,14 @@ define half @extractelt_nxv2f16_idx( %v, i32 zeroext %idx) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv2f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } @@ -580,6 +751,13 @@ define half @extractelt_nxv4f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -609,6 +787,14 @@ define half @extractelt_nxv4f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: 
vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -638,6 +824,14 @@ define half @extractelt_nxv4f16_idx( %v, i32 zeroext %idx) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv4f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } @@ -664,6 +858,13 @@ define half @extractelt_nxv8f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -693,6 +894,14 @@ define half @extractelt_nxv8f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -722,6 +931,14 @@ define half @extractelt_nxv8f16_idx( %v, i32 zeroext %idx) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv8f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } @@ -748,6 +965,13 @@ define half @extractelt_nxv16f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: 
extractelt_nxv16f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -777,6 +1001,14 @@ define half @extractelt_nxv16f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv16f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -806,6 +1038,14 @@ define half @extractelt_nxv16f16_idx( %v, i32 zeroext %idx) ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv16f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } @@ -832,6 +1072,13 @@ define half @extractelt_nxv32f16_0( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 0 ret half %r } @@ -861,6 +1108,14 @@ define half @extractelt_nxv32f16_imm( %v) { ; ZFMIN-NEXT: vmv.x.s a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; ZVFBFA-NEXT: vslidedown.vi v8, v8, 2 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 2 ret half %r } @@ -890,6 +1145,14 @@ define half @extractelt_nxv32f16_idx( %v, i32 zeroext %idx) ; ZFMIN-NEXT: vmv.x.s 
a0, v8 ; ZFMIN-NEXT: fmv.h.x fa0, a0 ; ZFMIN-NEXT: ret +; +; ZVFBFA-LABEL: extractelt_nxv32f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; ZVFBFA-NEXT: vslidedown.vx v8, v8, a0 +; ZVFBFA-NEXT: vmv.x.s a0, v8 +; ZVFBFA-NEXT: fmv.h.x fa0, a0 +; ZVFBFA-NEXT: ret %r = extractelement %v, i32 %idx ret half %r } diff --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll index 607e0085c3f46..7c6e0cea706d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-fp.ll @@ -7,225 +7,511 @@ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zfbfmin,+zvfhmin,+zvfbfmin,+v -target-abi=lp64d \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+experimental-zvfbfa,+v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+experimental-zvfbfa,+v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFBFA define @insertelt_nxv1bf16_0( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv1bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv1bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv1bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret 
%r = insertelement %v, bfloat %elt, i32 0 ret %r } define @insertelt_nxv1bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv1bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, mf4, tu, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv1bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, mf4, tu, ma +; ZVFH-NEXT: vmv.s.x v9, a0 +; ZVFH-NEXT: vslideup.vi v8, v9, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv1bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf4, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, mf4, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv1bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv1bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: fmv.x.h a2, fa0 -; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv1bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: fmv.x.h a2, fa0 +; ZVFH-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v9, a2 +; ZVFH-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v9, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv1bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a2 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; 
ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli a2, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, mf4, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } define @insertelt_nxv2bf16_0( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv2bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv2bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv2bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv2bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 0 ret %r } define @insertelt_nxv2bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv2bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv2bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; ZVFH-NEXT: vmv.s.x v9, a0 +; ZVFH-NEXT: vslideup.vi v8, v9, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv2bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: 
insertelt_nxv2bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, mf2, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv2bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv2bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: fmv.x.h a2, fa0 -; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv2bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: fmv.x.h a2, fa0 +; ZVFH-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v9, a2 +; ZVFH-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v9, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv2bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a2 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv2bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli a2, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, mf2, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } define @insertelt_nxv4bf16_0( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv4bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv4bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: 
ret +; +; ZVFHMIN-LABEL: insertelt_nxv4bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 0 ret %r } define @insertelt_nxv4bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv4bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vslideup.vi v8, v9, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv4bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v9, a0 +; ZVFH-NEXT: vslideup.vi v8, v9, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv4bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv4bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv4bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: fmv.x.h a2, fa0 -; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a2 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vx v8, v9, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv4bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: fmv.x.h a2, fa0 +; ZVFH-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v9, a2 +; ZVFH-NEXT: 
vsetvli zero, a1, e16, m1, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v9, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv4bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: fmv.x.h a2, fa0 +; ZVFHMIN-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v9, a2 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli a2, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v9, fa0 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } define @insertelt_nxv8bf16_0( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv8bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv8bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv8bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 0 ret %r } define @insertelt_nxv8bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv8bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vslideup.vi v8, v10, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv8bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, 
m1, tu, ma +; ZVFH-NEXT: vmv.s.x v10, a0 +; ZVFH-NEXT: vslideup.vi v8, v10, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv8bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v10, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v10, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v10, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v10, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv8bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv8bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a1 -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vx v8, v10, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv8bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v10, a1 +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: vsetvli zero, a1, e16, m2, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v10, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv8bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v10, a1 +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m2, tu, ma +; ZVFHMIN-NEXT: vslideup.vx v8, v10, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v10, fa0 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, m2, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v10, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } define @insertelt_nxv16bf16_0( %v, bfloat %elt) 
{ -; CHECK-LABEL: insertelt_nxv16bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv16bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv16bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 0 ret %r } define @insertelt_nxv16bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv16bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vslideup.vi v8, v12, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv16bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v12, a0 +; ZVFH-NEXT: vslideup.vi v8, v12, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv16bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v12, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v12, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v12, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v12, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv16bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv16bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, 
m1, ta, ma -; CHECK-NEXT: vmv.s.x v12, a1 -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, ma -; CHECK-NEXT: vslideup.vx v8, v12, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv16bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v12, a1 +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: vsetvli zero, a1, e16, m4, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v12, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv16bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v12, a1 +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, tu, ma +; ZVFHMIN-NEXT: vslideup.vx v8, v12, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v12, fa0 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, m4, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v12, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } define @insertelt_nxv32bf16_0( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv32bf16_0: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv32bf16_0: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v8, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv32bf16_0: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v8, a0 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32bf16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a0, zero, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v8, fa0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 0 ret %r } 
define @insertelt_nxv32bf16_imm( %v, bfloat %elt) { -; CHECK-LABEL: insertelt_nxv32bf16_imm: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a0, fa0 -; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vslideup.vi v8, v16, 3 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv32bf16_imm: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a0, fa0 +; ZVFH-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFH-NEXT: vmv.s.x v16, a0 +; ZVFH-NEXT: vslideup.vi v8, v16, 3 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv32bf16_imm: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a0, fa0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFHMIN-NEXT: vmv.s.x v16, a0 +; ZVFHMIN-NEXT: vslideup.vi v8, v16, 3 +; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32bf16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetivli zero, 4, e16alt, m1, tu, ma +; ZVFBFA-NEXT: vfmv.s.f v16, fa0 +; ZVFBFA-NEXT: vslideup.vi v8, v16, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 3 ret %r } define @insertelt_nxv32bf16_idx( %v, bfloat %elt, i32 zeroext %idx) { -; CHECK-LABEL: insertelt_nxv32bf16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: fmv.x.h a1, fa0 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v16, a1 -; CHECK-NEXT: addi a1, a0, 1 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, ma -; CHECK-NEXT: vslideup.vx v8, v16, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: insertelt_nxv32bf16_idx: +; ZVFH: # %bb.0: +; ZVFH-NEXT: fmv.x.h a1, fa0 +; ZVFH-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFH-NEXT: vmv.s.x v16, a1 +; ZVFH-NEXT: addi a1, a0, 1 +; ZVFH-NEXT: vsetvli zero, a1, e16, m8, tu, ma +; ZVFH-NEXT: vslideup.vx v8, v16, a0 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: insertelt_nxv32bf16_idx: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.h a1, fa0 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vmv.s.x v16, a1 +; ZVFHMIN-NEXT: addi a1, a0, 1 +; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, tu, ma +; ZVFHMIN-NEXT: vslideup.vx v8, v16, a0 +; 
ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32bf16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: vsetvli a1, zero, e16alt, m1, ta, ma +; ZVFBFA-NEXT: vfmv.s.f v16, fa0 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16alt, m8, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v16, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, bfloat %elt, i32 %idx ret %r } @@ -243,6 +529,13 @@ define @insertelt_nxv1f16_0( %v, half %el ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -262,6 +555,14 @@ define @insertelt_nxv1f16_imm( %v, half % ; ZVFHMIN-NEXT: vmv.s.x v9, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, e16, mf4, tu, ma +; ZVFBFA-NEXT: vmv.s.x v9, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -285,6 +586,16 @@ define @insertelt_nxv1f16_idx( %v, half % ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, mf4, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv1f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: fmv.x.h a2, fa0 +; ZVFBFA-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v9, a2 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } @@ -302,6 +613,13 @@ define @insertelt_nxv2f16_0( %v, half %el ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv2f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: 
fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -321,6 +639,14 @@ define @insertelt_nxv2f16_imm( %v, half % ; ZVFHMIN-NEXT: vmv.s.x v9, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv2f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; ZVFBFA-NEXT: vmv.s.x v9, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -344,6 +670,16 @@ define @insertelt_nxv2f16_idx( %v, half % ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, mf2, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv2f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: fmv.x.h a2, fa0 +; ZVFBFA-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v9, a2 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } @@ -361,6 +697,13 @@ define @insertelt_nxv4f16_0( %v, half %el ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -380,6 +723,14 @@ define @insertelt_nxv4f16_imm( %v, half % ; ZVFHMIN-NEXT: vmv.s.x v9, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v9, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v9, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v9, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -403,6 
+754,16 @@ define @insertelt_nxv4f16_idx( %v, half % ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m1, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v9, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv4f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: fmv.x.h a2, fa0 +; ZVFBFA-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v9, a2 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v9, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } @@ -420,6 +781,13 @@ define @insertelt_nxv8f16_0( %v, half %el ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -439,6 +807,14 @@ define @insertelt_nxv8f16_imm( %v, half % ; ZVFHMIN-NEXT: vmv.s.x v10, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v10, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v10, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v10, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -462,6 +838,16 @@ define @insertelt_nxv8f16_idx( %v, half % ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m2, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v10, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv8f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v10, a1 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, m2, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v10, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } @@ -479,6 +865,13 @@ define @insertelt_nxv16f16_0( %v, half ; ZVFHMIN-NEXT: 
vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -498,6 +891,14 @@ define @insertelt_nxv16f16_imm( %v, hal ; ZVFHMIN-NEXT: vmv.s.x v12, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v12, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v12, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v12, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -521,6 +922,16 @@ define @insertelt_nxv16f16_idx( %v, hal ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m4, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v12, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv16f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v12, a1 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, m4, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v12, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } @@ -538,6 +949,13 @@ define @insertelt_nxv32f16_0( %v, half ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, tu, ma ; ZVFHMIN-NEXT: vmv.s.x v8, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32f16_0: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v8, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 0 ret %r } @@ -557,6 +975,14 @@ define @insertelt_nxv32f16_imm( %v, hal ; ZVFHMIN-NEXT: vmv.s.x v16, a0 ; ZVFHMIN-NEXT: vslideup.vi v8, v16, 3 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32f16_imm: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a0, fa0 +; ZVFBFA-NEXT: vsetivli zero, 4, 
e16, m1, tu, ma +; ZVFBFA-NEXT: vmv.s.x v16, a0 +; ZVFBFA-NEXT: vslideup.vi v8, v16, 3 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 3 ret %r } @@ -580,6 +1006,16 @@ define @insertelt_nxv32f16_idx( %v, hal ; ZVFHMIN-NEXT: vsetvli zero, a1, e16, m8, tu, ma ; ZVFHMIN-NEXT: vslideup.vx v8, v16, a0 ; ZVFHMIN-NEXT: ret +; +; ZVFBFA-LABEL: insertelt_nxv32f16_idx: +; ZVFBFA: # %bb.0: +; ZVFBFA-NEXT: fmv.x.h a1, fa0 +; ZVFBFA-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVFBFA-NEXT: vmv.s.x v16, a1 +; ZVFBFA-NEXT: addi a1, a0, 1 +; ZVFBFA-NEXT: vsetvli zero, a1, e16, m8, tu, ma +; ZVFBFA-NEXT: vslideup.vx v8, v16, a0 +; ZVFBFA-NEXT: ret %r = insertelement %v, half %elt, i32 %idx ret %r } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll b/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll index fbc7311945c8b..7a63a4710c534 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfmv-bf-s.ll @@ -7,9 +7,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv1bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv1bf16( %0) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv1bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv1bf16( %0) @@ -21,9 +20,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv2bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv2bf16( %0) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv2bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv2bf16( %0) @@ -35,9 +33,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv4bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv4bf16( %0) nounwind { ; 
CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv4bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv4bf16( %0) @@ -49,9 +46,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv8bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv8bf16( %0) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv8bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv8bf16( %0) @@ -63,9 +59,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv16bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv16bf16( %0) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv16bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv16bf16( %0) @@ -77,9 +72,8 @@ declare bfloat @llvm.riscv.vfmv.f.s.nxv32bf16() define bfloat @intrinsic_vfmv.f.s_s_nxv32bf16( %0) nounwind { ; CHECK-LABEL: intrinsic_vfmv.f.s_s_nxv32bf16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 -; CHECK-NEXT: fmv.h.x fa0, a0 +; CHECK-NEXT: vsetivli zero, 1, e16alt, m1, ta, ma +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret entry: %a = call bfloat @llvm.riscv.vfmv.f.s.nxv32bf16( %0) From fb2563d137b839b13105ccb8cea1dc4655572744 Mon Sep 17 00:00:00 2001 From: Claire Fan Date: Thu, 13 Nov 2025 17:50:52 +0100 Subject: [PATCH 15/25] [BPF] add allows-misaligned-mem-access target feature (#167013) This proposal adds a 
`cl::opt` CLI flag `-bpf-allow-misaligned-mem-access` to BPF target that lets users enable allowing misaligned memory accesses. The motivation behind the proposal is user space eBPF VMs (interpreters or JITs running in user space) typically run on real CPUs where unaligned memory accesses are acceptable (or handled efficiently) and can be enabled to simplify lowering and improve performance. In contrast, kernel eBPF must obey verifier constraints and platform-specific alignment restrictions. A new CLI option keeps kernel behavior unchanged while giving userspace VMs an explicit opt-in to enable more permissive codegen. It supports both use-cases without diverging codebases. --- llvm/lib/Target/BPF/BPF.td | 4 + llvm/lib/Target/BPF/BPFISelLowering.cpp | 20 ++ llvm/lib/Target/BPF/BPFISelLowering.h | 7 + llvm/lib/Target/BPF/BPFSubtarget.cpp | 1 + llvm/lib/Target/BPF/BPFSubtarget.h | 6 + llvm/test/CodeGen/BPF/unaligned_load_store.ll | 196 ++++++++++++++++++ 6 files changed, 234 insertions(+) create mode 100644 llvm/test/CodeGen/BPF/unaligned_load_store.ll diff --git a/llvm/lib/Target/BPF/BPF.td b/llvm/lib/Target/BPF/BPF.td index dff76ca07af51..a7aa6274f5ac1 100644 --- a/llvm/lib/Target/BPF/BPF.td +++ b/llvm/lib/Target/BPF/BPF.td @@ -27,6 +27,10 @@ def ALU32 : SubtargetFeature<"alu32", "HasAlu32", "true", def DwarfRIS: SubtargetFeature<"dwarfris", "UseDwarfRIS", "true", "Disable MCAsmInfo DwarfUsesRelocationsAcrossSections">; +def MisalignedMemAccess : SubtargetFeature<"allows-misaligned-mem-access", + "AllowsMisalignedMemAccess", "true", + "Allows misaligned memory access">; + def : Proc<"generic", []>; def : Proc<"v1", []>; def : Proc<"v2", []>; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 3c61216cd9327..ecefd2379356a 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -206,6 +206,26 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, HasJmp32 = 
STI.getHasJmp32(); HasJmpExt = STI.getHasJmpExt(); HasMovsx = STI.hasMovsx(); + + AllowsMisalignedMemAccess = STI.getAllowsMisalignedMemAccess(); +} + +bool BPFTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align, + MachineMemOperand::Flags, + unsigned *Fast) const { + // allows-misaligned-mem-access is disabled + if (!AllowsMisalignedMemAccess) + return false; + + // only allow misalignment for simple value types + if (!VT.isSimple()) + return false; + + // always assume fast mode when misalignment is allowed + if (Fast) + *Fast = true; + + return true; } bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 3d6e7c70df28b..8607e4f8c9e69 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -32,6 +32,10 @@ class BPFTargetLowering : public TargetLowering { // with the given GlobalAddress is legal. bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align, + MachineMemOperand::Flags, + unsigned *) const override; + BPFTargetLowering::ConstraintType getConstraintType(StringRef Constraint) const override; @@ -61,6 +65,9 @@ class BPFTargetLowering : public TargetLowering { bool HasJmpExt; bool HasMovsx; + // Allows Misalignment + bool AllowsMisalignedMemAccess; + SDValue LowerSDIVSREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp index 8f16fe5bfdb51..726f8f4b39827 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -69,6 +69,7 @@ void BPFSubtarget::initializeEnvironment() { HasStoreImm = false; HasLoadAcqStoreRel = false; HasGotox = false; + 
AllowsMisalignedMemAccess = false; } void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h index e870dfdc85ec9..24eff862224b0 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.h +++ b/llvm/lib/Target/BPF/BPFSubtarget.h @@ -63,6 +63,9 @@ class BPFSubtarget : public BPFGenSubtargetInfo { // whether we should enable MCAsmInfo DwarfUsesRelocationsAcrossSections bool UseDwarfRIS; + // whether we allows misaligned memory access + bool AllowsMisalignedMemAccess; + // whether cpu v4 insns are enabled. bool HasLdsx, HasMovsx, HasBswap, HasSdivSmod, HasGotol, HasStoreImm, HasLoadAcqStoreRel, HasGotox; @@ -87,6 +90,9 @@ class BPFSubtarget : public BPFGenSubtargetInfo { bool getHasJmp32() const { return HasJmp32; } bool getHasAlu32() const { return HasAlu32; } bool getUseDwarfRIS() const { return UseDwarfRIS; } + bool getAllowsMisalignedMemAccess() const { + return AllowsMisalignedMemAccess; + } bool hasLdsx() const { return HasLdsx; } bool hasMovsx() const { return HasMovsx; } bool hasBswap() const { return HasBswap; } diff --git a/llvm/test/CodeGen/BPF/unaligned_load_store.ll b/llvm/test/CodeGen/BPF/unaligned_load_store.ll new file mode 100644 index 0000000000000..b302a80d6cd4f --- /dev/null +++ b/llvm/test/CodeGen/BPF/unaligned_load_store.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=bpfel -mattr=+allows-misaligned-mem-access -verify-machineinstrs %s -o - \ +; RUN: | FileCheck --check-prefixes=ALL,MISALIGN %s +; RUN: llc -mtriple=bpfeb -mattr=+allows-misaligned-mem-access -verify-machineinstrs %s -o - \ +; RUN: | FileCheck --check-prefixes=ALL,MISALIGN %s + +; RUN: llc -mtriple=bpfel -verify-machineinstrs %s -o - \ +; RUN: | FileCheck --check-prefixes=ALL,ALIGN %s +; RUN: llc -mtriple=bpfeb -verify-machineinstrs %s -o - \ +; RUN: | FileCheck --check-prefixes=ALL,ALIGN %s +; NOTE: 
+; This test verifies that the new +bpf-allow-misaligned-mem-access +; feature allows the BPF backend to emit direct unaligned load/store +; instructions instead of byte-by-byte emulation sequences. + +; --------------------------------------------------------------------- +; i8 load +; --------------------------------------------------------------------- +define i8 @test_load_i8(i8* %p) { +; ALL-LABEL: test_load_i8: +; ALL: # %bb.0: +; ALL-NEXT: w{{[0-9]+}} = *(u8 *)(r1 + 0) +; ALL-NEXT: exit + %v = load i8, i8* %p, align 1 + ret i8 %v +} + +; --------------------------------------------------------------------- +; i8 store +; --------------------------------------------------------------------- +define void @test_store_i8(i8* %p, i8 %v) { +; ALL-LABEL: test_store_i8: +; ALL: # %bb.0: +; ALL-NEXT: *(u8 *)(r1 + 0) = w{{[0-9]+}} +; ALL-NEXT: exit + store i8 %v, i8* %p, align 1 + ret void +} + +; --------------------------------------------------------------------- +; i16 load +; --------------------------------------------------------------------- +define i16 @test_load_i16(i16* %p) { +; MISALIGN-LABEL: test_load_i16: +; MISALIGN: # %bb.0: +; MISALIGN: w{{[0-9]+}} = *(u16 *)(r1 + 0) +; MISALIGN: exit +; +; ALIGN-LABEL: test_load_i16: +; ALIGN: # %bb.0: +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 0) +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 1) +; ALIGN-DAG: w{{[0-9]+}} <<= 8 +; ALIGN-DAG: w{{[0-9]+}} |= w{{[0-9]+}} +; ALIGN: exit + %v = load i16, i16* %p, align 1 + ret i16 %v +} + +; --------------------------------------------------------------------- +; i16 store +; --------------------------------------------------------------------- +define void @test_store_i16(i16* %p, i16 %v) { +; MISALIGN-LABEL: test_store_i16: +; MISALIGN: # %bb.0: +; MISALIGN: *(u16 *)(r1 + 0) = w{{[0-9]+}} +; MISALIGN: exit +; +; ALIGN-LABEL: test_store_i16: +; ALIGN: # %bb.0: +; ALIGN-DAG: *(u8 *)(r1 + 0) = w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} >>= 8 +; ALIGN-DAG: *(u8 *)(r1 + 1) = w{{[0-9]+}} 
+; ALIGN: exit + store i16 %v, i16* %p, align 1 + ret void +} + +; --------------------------------------------------------------------- +; i32 load +; --------------------------------------------------------------------- + +define i32 @test_load_i32(i32* %p) { +; MISALIGN-LABEL: test_load_i32: +; MISALIGN: # %bb.0: +; MISALIGN: w{{[0-9]+}} = *(u32 *)(r1 + 0) +; MISALIGN: exit +; +; ALIGN-LABEL: test_load_i32: +; ALIGN: # %bb.0: +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 0) +; ALIGN-DAG: w{{[0-9]+}} <<= 8 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 1) +; ALIGN-DAG: w{{[0-9]+}} |= w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 2) +; ALIGN-DAG: w{{[0-9]+}} <<= 16 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 3) +; ALIGN-DAG: w{{[0-9]+}} <<= 24 +; ALIGN: exit + %v = load i32, i32* %p, align 1 + ret i32 %v +} + +; --------------------------------------------------------------------- +; i32 store +; --------------------------------------------------------------------- + +define void @test_store_i32(i32* %p, i32 %v) { +; MISALIGN-LABEL: test_store_i32: +; MISALIGN: # %bb.0: +; MISALIGN: *(u32 *)(r1 + 0) = w{{[0-9]+}} +; MISALIGN: exit +; +; ALIGN-LABEL: test_store_i32: +; ALIGN: # %bb.0: +; ALIGN-DAG: w{{[0-9]+}} = w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} >>= 24 +; ALIGN-DAG: *(u8 *)(r1 + 0) = w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} = w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} >>= 16 +; ALIGN-DAG: *(u8 *)(r1 + 1) = w{{[0-9]+}} +; ALIGN-DAG: *(u8 *)(r1 + 2) = w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} >>= 8 +; ALIGN-DAG: *(u8 *)(r1 + 3) = w{{[0-9]+}} +; ALIGN: exit + store i32 %v, i32* %p, align 1 + ret void +} + +; --------------------------------------------------------------------- +; i64 load +; --------------------------------------------------------------------- + +define i64 @test_load_i64(i64* %p) { +; MISALIGN-LABEL: test_load_i64: +; MISALIGN: # %bb.0: +; MISALIGN: r0 = *(u64 *)(r1 + 0) +; MISALIGN: exit +; +; ALIGN-LABEL: test_load_i64: +; ALIGN: # %bb.0: +; ALIGN-DAG: 
w{{[0-9]+}} = *(u8 *)(r1 + 0) +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 1) +; ALIGN-DAG: r{{[0-9]+}} <<= 8 +; ALIGN-DAG: r{{[0-9]+}} |= r{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 2) +; ALIGN-DAG: r{{[0-9]+}} <<= 16 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 3) +; ALIGN-DAG: r{{[0-9]+}} <<= 24 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 4) +; ALIGN-DAG: w{{[0-9]+}} <<= 8 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 5) +; ALIGN-DAG: w{{[0-9]+}} |= w{{[0-9]+}} +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 6) +; ALIGN-DAG: w{{[0-9]+}} <<= 16 +; ALIGN-DAG: w{{[0-9]+}} = *(u8 *)(r1 + 7) +; ALIGN-DAG: w{{[0-9]+}} <<= 24 +; ALIGN-DAG: r{{[0-9]+}} <<= 32 +; ALIGN: exit + %v = load i64, i64* %p, align 1 + ret i64 %v +} + +; --------------------------------------------------------------------- +; i64 store +; --------------------------------------------------------------------- + +define void @test_store_i64(i64* %p, i64 %v) { +; MISALIGN-LABEL: test_store_i64: +; MISALIGN: # %bb.0: +; MISALIGN: *(u64 *)(r1 + 0) = r2 +; MISALIGN: exit +; +; ALIGN-LABEL: test_store_i64: +; ALIGN: # %bb.0: +; ALIGN-DAG: *(u8 *)(r1 + 0) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} = r{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 56 +; ALIGN-DAG: *(u8 *)(r1 + 1) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 48 +; ALIGN-DAG: *(u8 *)(r1 + 2) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 40 +; ALIGN-DAG: *(u8 *)(r1 + 3) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 32 +; ALIGN-DAG: *(u8 *)(r1 + 4) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 24 +; ALIGN-DAG: *(u8 *)(r1 + 5) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 16 +; ALIGN-DAG: *(u8 *)(r1 + 6) = w{{[0-9]+}} +; ALIGN-DAG: r{{[0-9]+}} >>= 8 +; ALIGN-DAG: *(u8 *)(r1 + 7) = w{{[0-9]+}} +; ALIGN: exit + store i64 %v, i64* %p, align 1 + ret void +} From 39fbec06222ecb37b489d69b45d801efa264bf26 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Thu, 13 Nov 2025 16:52:04 +0000 Subject: [PATCH 16/25] [AArch64][llvm] Improve writeback reg handling for FEAT_MOPS (#167763) 
As mentioned in comments for #164913, the `if()` statements here can't be externally triggered, since these writeback registers are passed in from the caller. So they should really be `assert()`s so it's obvious we don't need testcases for them, and more optimal. --- .../AArch64/AsmParser/AArch64AsmParser.cpp | 44 +++++++------------ 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 7293b7fdb0d20..2730833ba06d9 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5923,21 +5923,15 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::CPYETWN: case AArch64::CPYETRN: case AArch64::CPYETN: { - MCRegister Xd_wb = Inst.getOperand(0).getReg(); - MCRegister Xs_wb = Inst.getOperand(1).getReg(); - MCRegister Xn_wb = Inst.getOperand(2).getReg(); + // Xd_wb == op0, Xs_wb == op1, Xn_wb == op2 MCRegister Xd = Inst.getOperand(3).getReg(); MCRegister Xs = Inst.getOperand(4).getReg(); MCRegister Xn = Inst.getOperand(5).getReg(); - if (Xd_wb != Xd) - return Error(Loc[0], - "invalid CPY instruction, Xd_wb and Xd do not match"); - if (Xs_wb != Xs) - return Error(Loc[0], - "invalid CPY instruction, Xs_wb and Xs do not match"); - if (Xn_wb != Xn) - return Error(Loc[0], - "invalid CPY instruction, Xn_wb and Xn do not match"); + + assert(Xd == Inst.getOperand(0).getReg() && "Xd_wb and Xd do not match"); + assert(Xs == Inst.getOperand(1).getReg() && "Xs_wb and Xs do not match"); + assert(Xn == Inst.getOperand(2).getReg() && "Xn_wb and Xn do not match"); + if (Xd == Xs) return Error(Loc[0], "invalid CPY instruction, destination and source" " registers are the same"); @@ -5973,17 +5967,14 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::MOPSSETGET: case AArch64::MOPSSETGEN: case AArch64::MOPSSETGETN: { - 
MCRegister Xd_wb = Inst.getOperand(0).getReg(); - MCRegister Xn_wb = Inst.getOperand(1).getReg(); + // Xd_wb == op0, Xn_wb == op1 MCRegister Xd = Inst.getOperand(2).getReg(); MCRegister Xn = Inst.getOperand(3).getReg(); MCRegister Xm = Inst.getOperand(4).getReg(); - if (Xd_wb != Xd) - return Error(Loc[0], - "invalid SET instruction, Xd_wb and Xd do not match"); - if (Xn_wb != Xn) - return Error(Loc[0], - "invalid SET instruction, Xn_wb and Xn do not match"); + + assert(Xd == Inst.getOperand(0).getReg() && "Xd_wb and Xd do not match"); + assert(Xn == Inst.getOperand(1).getReg() && "Xn_wb and Xn do not match"); + if (Xd == Xn) return Error(Loc[0], "invalid SET instruction, destination and size" " registers are the same"); @@ -6007,16 +5998,13 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, case AArch64::SETGOET: case AArch64::SETGOEN: case AArch64::SETGOETN: { - MCRegister Xd_wb = Inst.getOperand(0).getReg(); - MCRegister Xn_wb = Inst.getOperand(1).getReg(); + // Xd_wb == op0, Xn_wb == op1 MCRegister Xd = Inst.getOperand(2).getReg(); MCRegister Xn = Inst.getOperand(3).getReg(); - if (Xd_wb != Xd) - return Error(Loc[0], - "invalid SET instruction, Xd_wb and Xd do not match"); - if (Xn_wb != Xn) - return Error(Loc[0], - "invalid SET instruction, Xn_wb and Xn do not match"); + + assert(Xd == Inst.getOperand(0).getReg() && "Xd_wb and Xd do not match"); + assert(Xn == Inst.getOperand(1).getReg() && "Xn_wb and Xn do not match"); + if (Xd == Xn) return Error(Loc[0], "invalid SET instruction, destination and size" " registers are the same"); From 1c196452dd4afd90f0d239a6d9aeb31b136e3df4 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 13 Nov 2025 16:58:21 +0000 Subject: [PATCH 17/25] Revert "[compiler-rt][ARM] Optimized mulsf3 and divsf3" (#167906) Reverts llvm/llvm-project#161546 One of the buildbots reported a cmake error I don't understand, and which I didn't get in my own test builds: ``` CMake Error at 
/var/lib/buildbot/fuchsia-x86_64-linux/llvm-project/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake:23 (try_compile): COMPILE_DEFINITIONS specified on a srcdir type TRY_COMPILE ``` My best guess is that the thing I did in `CheckAssemblerFlag.cmake` only works on some versions of cmake. But I don't understand the problem well enough to fix it quickly, so I'm reverting the whole patch and will reland it later. --- .../cmake/Modules/CheckAssemblerFlag.cmake | 38 -- compiler-rt/lib/builtins/CMakeLists.txt | 45 -- compiler-rt/lib/builtins/arm/divsf3.S | 608 ----------------- compiler-rt/lib/builtins/arm/fnan2.c | 42 -- compiler-rt/lib/builtins/arm/fnorm2.c | 62 -- compiler-rt/lib/builtins/arm/funder.c | 78 --- compiler-rt/lib/builtins/arm/mulsf3.S | 309 --------- compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 251 ------- compiler-rt/test/builtins/CMakeLists.txt | 4 - compiler-rt/test/builtins/Unit/divsf3_test.c | 503 +++----------- compiler-rt/test/builtins/Unit/mulsf3_test.c | 616 ------------------ 11 files changed, 95 insertions(+), 2461 deletions(-) delete mode 100644 compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake delete mode 100644 compiler-rt/lib/builtins/arm/divsf3.S delete mode 100644 compiler-rt/lib/builtins/arm/fnan2.c delete mode 100644 compiler-rt/lib/builtins/arm/fnorm2.c delete mode 100644 compiler-rt/lib/builtins/arm/funder.c delete mode 100644 compiler-rt/lib/builtins/arm/mulsf3.S delete mode 100644 compiler-rt/lib/builtins/arm/thumb1/mulsf3.S delete mode 100644 compiler-rt/test/builtins/Unit/mulsf3_test.c diff --git a/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake b/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake deleted file mode 100644 index 49e8b8547c5cd..0000000000000 --- a/compiler-rt/cmake/Modules/CheckAssemblerFlag.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# Helper function to find out whether the assembler supports a particular -# command-line flag. 
You'd like to use the standard check_compiler_flag(), but -# that only supports a fixed list of languages, and ASM isn't one of them. So -# we do it ourselves, by trying to assemble an empty source file. - -function(check_assembler_flag outvar flag) - if(NOT DEFINED "${outvar}") - if(NOT CMAKE_REQUIRED_QUIET) - message(CHECK_START "Checking for assembler flag ${flag}") - endif() - - # Stop try_compile from attempting to link the result of the assembly, so - # that we don't depend on having a working linker, and also don't have to - # figure out what special symbol like _start needs to be defined in the - # test input. - # - # This change is made within the dynamic scope of this function, so - # CMAKE_TRY_COMPILE_TARGET_TYPE will be restored to its previous value on - # return. - set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) - - # Try to assemble an empty file with a .S name, using the provided flag. - try_compile(success - SOURCE_FROM_CONTENT "CheckAssemblerFlag.s" "" - COMPILE_DEFINITIONS ${flag} - NO_CACHE) - - if(NOT CMAKE_REQUIRED_QUIET) - if(success) - message(CHECK_PASS "Accepted") - set(${outvar} 1 CACHE INTERNAL "Test assembler flag ${flag}") - else() - message(CHECK_FAIL "Not accepted") - set(${outvar} "" CACHE INTERNAL "Test assembler flag ${flag}") - endif() - endif() - endif() -endfunction() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 6f5c2cd7d1971..02e6ecfbdb60e 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -60,7 +60,6 @@ endif() include(builtin-config-ix) include(CMakeDependentOption) include(CMakePushCheckState) -include(CheckAssemblerFlag) option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS "Do not export any symbols from the static library." ON) @@ -424,40 +423,6 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) -option(COMPILER_RT_ARM_OPTIMIZED_FP - "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. 
Likely to increase code size, but be faster." ON) - -if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") - check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always) - if(COMPILER_RT_HAS_MIMPLICIT_IT) - set(implicit_it_flag -mimplicit-it=always) - else() - check_assembler_flag( - COMPILER_RT_HAS_WA_MIMPLICIT_IT -Wa,-mimplicit-it=always) - if(COMPILER_RT_HAS_WA_MIMPLICIT_IT) - set(implicit_it_flag -Wa,-mimplicit-it=always) - else() - message(WARNING "Don't know how to set the -mimplicit-it=always flag in this assembler; not including Arm optimized implementations") - set(implicit_it_flag "") - endif() - endif() - - if(implicit_it_flag) - set(assembly_files - arm/mulsf3.S - arm/divsf3.S) - set_source_files_properties(${assembly_files} - PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) - set(arm_or_thumb2_base_SOURCES - ${assembly_files} - arm/fnan2.c - arm/fnorm2.c - arm/funder.c - ${arm_or_thumb2_base_SOURCES} - ) - endif() -endif() - set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -491,16 +456,6 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) -if(COMPILER_RT_ARM_OPTIMIZED_FP) - set(thumb1_base_SOURCES - arm/thumb1/mulsf3.S - arm/fnan2.c - arm/fnorm2.c - arm/funder.c - ${thumb1_base_SOURCES} - ) -endif() - set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S deleted file mode 100644 index 2f37234457b7b..0000000000000 --- a/compiler-rt/lib/builtins/arm/divsf3.S +++ /dev/null @@ -1,608 +0,0 @@ -//===-- divsf3.S - single-precision floating point division ---------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements single-precision soft-float division with the IEEE-754 -// default rounding (to nearest, ties to even), in optimized AArch32 assembly -// language suitable to be built as either Arm or Thumb2. -// -//===----------------------------------------------------------------------===// - -#include "../assembly.h" - - - .syntax unified - .text - .p2align 2 - -DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3) - -DEFINE_COMPILERRT_FUNCTION(__divsf3) - // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 - // of each register so that there will be space lower down to store extra - // data without exponent arithmetic carrying into it. In the process, check - // both exponents for 00 or FF and branch out of line to handle all the - // uncommon types of value (infinity, NaN, zero, denormals). - // - // Chaining conditional instructions like this means that the second - // instruction (setting up r3) might not be executed at all, so fdiv_uncommon - // will have to redo it just in case. That saves an instruction here, - // executed for _all_ inputs, and moves it to the uncommon path run for only - // some inputs. - mov r12, #0xFF0000 - ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) - andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) - teqne r2, r12 // if neither was 0, is one FF? - teqne r3, r12 // or the other? - beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes - - // Calculate the output sign, which is always just the XOR of the input - // signs. Store it in bit 8 of r2, below the numerator exponent. - teq r0, r1 // is the output sign bit 1? - orrmi r2, r2, #0x100 // if so, set bit 8 of r2 - - // Isolate the mantissas of both values, by setting bit 23 of each one and - // clearing the 8 bits above that. 
- // - // In the process, swap the register allocations (which doesn't cost extra - // instructions if we do it as part of this manipulation). We want the - // numerator not to be in r0, because r0 is where we'll build up the quotient - // while subtracting things from the numerator. - orr r12, r0, #1 << 23 - orr r0, r1, #1 << 23 - bic r1, r12, #0xFF000000 - bic r0, r0, #0xFF000000 - -LOCAL_LABEL(div): - // Start of the main division. We get here knowing that: - // - // r0 = mantissa of denominator, with the leading 1 at bit 23 - // r1 = mantissa of numerator, similarly - // r2 = (exponent of numerator << 16) + (result sign << 8) - // r3 = (exponent of denominator << 16) - - push {r14} // we'll need an extra register - - // Calculate the initial result exponent by just subtracting the two input - // exponents. This doesn't affect the sign bit lower down in r2. - sub r2, r2, r3 - - // That initial exponent might need to be adjusted by 1, depending on whether - // dividing the mantissas gives a value >=1 or <1. We don't need to wait - // until the division is finished to work that out: we can tell immediately - // by just comparing the mantissas. - // - // The basic idea is to do the comparison in a way that sets the C flag if - // numerator >= denominator. Then we recombine the sign and exponent by doing - // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted - // down to the low 8 bits, just below the sign bit, and using ADC rather than - // ADD folds in the conditional increment from the mantissa comparison. - // - // If we're not incrementing the output exponent, we instead shift the - // numerator mantissa left by 1, so that it _is_ greater than the denominator - // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. - // - // The exponent also needs to be rebiased, so that dividing two numbers the - // same gives an output exponent of 0x7F. 
If the two inputs have the same - // exponent then we'll have computed an exponent of 0 via the SUB instruction - // above; if the mantissas are the same as well then the ADC will increment - // it; also, the leading bit of the quotient will increment the exponent - // again when we recombine it with the output mantissa later. So we need to - // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB - // come to 0x7F after both of those increments. - // - // Putting all of that together, what we _want_ to do is this: - // - // [#1] CMP r1, r0 // set C if num >= den - // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left - // [#3] ADD r2, r2, #0x7D0000 // rebias exponent - // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment - // - // However, we only do the first of those four instructions right here. The - // other three are distributed through the code below, after unrelated load - // or multiply instructions which will have a result delay slot on simple - // CPUs. Each is labelled "exponent setup [#n]" in a comment. - // - // (Since instruction #4 depends on the flags set up by #2, we must avoid - // clobbering the flags in _any_ of the instructions interleaved with this!) - cmp r1, r0 // exponent setup [#1] - - // Start the mantissa division by making an approximation to the reciprocal - // of the denominator. We first obtain an 8-bit approximation using a table - // lookup indexed by the top 7 denominator bits (counting the leading 1, so - // really there are only 6 bits in the table index). - // - // (r0 >> 17) is the table index, and its top bit is always set, so it ranges - // from 64 to 127 inclusive. So we point the base register 64 bytes before - // the actual table. - adr r12, LOCAL_LABEL(tab) - 64 -#if __thumb__ - // Thumb can't do this particular shift+add+load in one instruction - it only - // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must - // calculate the load offset separately. 
- add r14, r12, r0, lsr #17 - ldrb r14, [r14] -#else - ldrb r14, [r12, r0, lsr #17] -#endif - - // Now do an iteration of Newton-Raphson to improve that 8-bit approximation - // to have 15-16 accurate bits. - // - // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d - // and you have some approximation x, your next approximation is X = x(2-dx). - // Looked at one way, this is the result of applying the N-R formula - // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it - // is to suppose that dx = 1 - e, for some e which is small (because dx is - // already reasonably close to 1). Then you want to double the number of - // correct bits in the next approximation, i.e. square the error. So you want - // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. - // - // In this situation, we're working in fixed-point integers rather than real - // numbers, and all the scales are different: - // * our input denominator d is in the range [2^23,2^24) - // * our input approximation x is in the range [2^7,2^8) - // * we want the output approximation to be in the range [2^15,2^16) - // Those factors combine to mean that we want - // x(2^32-dx) / 2^23 - // = (2^9 x) - (dx^2 / 2^23) - // - // But we also want to compute this using ordinary MUL, not a long multiply - // instruction (those are slower). So we need to worry about the product - // overflowing. dx fits in 32 bits, because it's the product of something - // <2^24 with something <2^8; but we must shift it right before multiplying - // by x again. 
- - mul r12, r0, r14 // r12 = dx - movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot - mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 - mul r3, r12, r14 // r3 ~= -dx^2/2^8 - mov r14, r14, lsl #9 // r14 = 2^9 x - add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 - - // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, - // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 - // in principle, and the max mantissa 2^24-1 would have reciprocal just over - // 2^15). The error is always negative (r14 is an underestimate of the true - // value), and the maximum error is 6 and a bit ULP (that is, the true - // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly - // less than 0x10000 (even in the case of the min mantissa, where the true - // value would be _exactly_ 0x10000), which eliminates a case of integer - // overflow. - // - // All of these properties of the reciprocal approximation are checked by - // exhaustively iterating over all 2^23 possible input mantissas. (The nice - // thing about doing this in single rather than double precision!) - // - // Now we extract most of the quotient by two steps of long division, using - // the reciprocal estimate to identify a multiple of the denominator to - // subtract from the numerator. To avoid integer overflow, the numerator - // mantissa is shifted down 8 bits so that it's less than 0x10000. After we - // calculate an approximate quotient, we shift the numerator left and - // subtract that multiple of the denominator, moving the next portion of the - // numerator into range for the next iteration. - - // First iteration of long division. We shift the numerator left 11 bits, and - // since the quotient approximation is scaled by 2^31, we must shift that - // right by 20 to make the right product to subtract from the numerator. 
- mov r12, r1, lsr #8 // shift the numerator down - mul r12, r14, r12 // make the quotient approximation - mov r1, r1, lsl #11 // shift numerator left, ready for subtraction - mov r3, r12, lsr #20 // make first 12-bit block of quotient bits - mls r1, r0, r3, r1 // subtract that multiple of den from num - - add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot - - // Second iteration of long division. Differences from the first step: this - // time we shift the numerator 12 bits instead of 11, so that the total of - // both steps is 23 bits, i.e. we've shifted up by exactly the full width of - // the output mantissa. Also, the block of output quotient bits is left in a - // different register: it was in r3 the first time, and this time it's in - // r12, so that we still have both available at the end of the process. - mov r12, r1, lsr #8 // shift the numerator down - mul r12, r14, r12 // make the quotient approximation - mov r1, r1, lsl #12 // shift numerator left, ready for subtraction - mov r12, r12, lsr #19 // make second 11-bit block of quotient - mls r1, r0, r12, r1 // subtract that multiple of den from num - - adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot - - // Now r1 contains the original numerator, shifted left 23, minus _some_ - // multiple of the original denominator (which is still in r0). The bounds on - // the error in the above steps should make the error at most 1: that is, we - // may have to subtract the denominator one more time to make r1 < r0, and - // increment the quotient by one more. - // - // Our quotient is still in two pieces, computed separately in the above long - // division steps. We fold the final increment into the same instruction that - // recombines them, by doing the comparison in such a way that it sets the - // carry flag if the increment is needed. 
- - cmp r1, r0 // Set carry flag if num >= den - subhs r1, r1, r0 // If so, subtract den from num - adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 - - // We've finished with r14 as a temporary register, so we can unstack it now. - pop {r14} - - // Now r3 contains the _rounded-down_ output quotient, and r1 contains the - // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and - // 0 <= r1 < denominator. - // - // Next we must round to nearest, by checking if r1 is greater than half the - // denominator. In division, it's not possible to hit an exact round-to-even - // halfway case, so we don't need to spend any time checking for it. - // - // Proof of no round-to-even: define the 'width' of a dyadic rational to be - // the distance between the lowest and highest 1 bits in its binary - // representation, or equivalently, the index of its high bit if you scale it - // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has - // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. - // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w - // be the maximum width that the input precision supports (so that for single - // precision, w=23). Then if some division n/d were a round-to-even case, the - // true quotient q=n/d would have width exactly w+1. But we have qd=n, so - // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the - // input precision, hence had width <= w.) - // - // So we don't need to check for an exact _halfway_ case and clear the low - // bit of the quotient after rounding up, as addition and multiplication both - // need to do. But we do need to remember if the quotient itself was exact, - // that is, if there was no remainder at all. That's needed in underflow - // handling. - - // The rounding check wants to compare remainder with denominator/2. But of - // course in integers it's easier to compare 2*remainder with denominator. 
So - // we start by shifting the remainder left by 1, and in the process, set Z if - // it's exactly 0 (i.e. the result needs no rounding at all). - lsls r1, r1, #1 - // Now trial-subtract the denominator. We don't do this at all if the result - // was exact. If we do do it, r1 goes negative precisely if we need to round - // up, which sets the C flag. (The previous instruction will have left C - // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if - // we're rounding up.) - subsne r1, r1, r0 - // Recombine the quotient with the sign + exponent, and use the C flag from - // the previous instruction to increment the quotient if we're rounding up. - adc r0, r3, r2, lsl #23 - - // If we haven't either overflowed or underflowed, we're done. We can - // identify most of the safe cases by doing an unsigned comparison of the - // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < - // 0xFC0000 then we have neither underflow nor overflow. - // - // Rationale: the value in the top half of r2 had three chances to be - // incremented before becoming the exponent field of the actual output float. - // It was incremented if we found the numerator mantissa was >= the - // denominator (producing the value in the _bottom_ half of r2, which we just - // ADCed into the output). Then it gets unconditionally incremented again - // when the ADC combines it with the leading mantissa bit. And finally, - // round-up might increment it a third time. So 0xFC is the smallest value - // that can possibly turn into the overflowed value 0xFF after all those - // increments. - // - // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in - // the final result's exponent field (and then rounding might increase it - // further); if the exponent was less than that then r2 wraps round and looks - // like a very large positive integer from the point of view of this unsigned - // comparison. 
- cmp r2, #0xFC0000 - bxlo lr - - // The same comparison will have set the N and V flags to reflect the result - // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably - // distinguishes potential underflow (r2 is negative) from potential overflow - // (r2 is positive and at least 0xFC0000) - bge LOCAL_LABEL(overflow) - - // Here we might or might not have underflow (but we know we don't have - // overflow). To check more carefully, we look at the _bottom_ half of r2, - // which contains the exponent after the first adjustment (for num >= denom), - // That is, it's still off by 1 (compensating for the leading quotient bit), - // and is also before rounding. - // - // We neglect the effect of rounding: division results that are tiny (less - // than the smallest normalised number) before rounding, but then round up to - // the smallest normal number, are an acceptable edge case to handle slowly. - // We pass those to funder without worrying about them. - // - // So we want to check whether the bottom half of r2 was negative. It would - // be nice to check bits 8-15 of it, but unfortunately, it's already been - // combined with the sign (at bit 8), so those bits don't tell us anything - // useful. Instead we look at the top 4 bits of the exponent field, i.e. the - // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is - // less than 3, so it doesn't reach those bits; the smallest possible - // underflow, obtained by dividing the smallest denormal by the largest - // finite number, is -151 (before the leading bit increments it), which will - // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be - // 0x60 or greater for a (pre-rounding) underflow, and zero for a - // non-underflow. - - tst r2, #0xF0 - bxeq lr // no underflow after all; return - - // Rebias the exponent for funder, which also corrects the sign bit. 
- add r0, r0, #192 << 23 - // Tell funder whether the true value is greater or less than the number in - // r0. This is obtained from the sign of the remainder (still in r1), with - // the only problem being that it's currently reversed. So negate r1 (leaving - // 0 at 0 to indicate exactness). - rsbs r1, r1, #0 - b SYMBOL_NAME(__compiler_rt_funder) - -LOCAL_LABEL(overflow): - // Here we might or might not have overflow (but we know we don't have - // underflow). We must check whether we really have overflowed. - // - // For this it's easiest to check the exponent field in the actual output - // value in r0, after _all_ the adjustments have been completed. The largest - // overflowed exponent is 0x193, and the smallest exponent that can reach - // this is 0xFD (we checked against 0xFC above, but then the leading quotient - // bit incremented it). So it's enough to shift the output left by one - // (moving the exponent field to the top), increment it once more (so that - // the smallest overflowed exponent 0xFF wraps round to 0), and then compare - // against 0xFE000000 as an unsigned integer. - mov r12, r0, lsl #1 - add r12, r12, #1 << 24 - cmp r12, #0xFE << 24 // Check for exp = 253 or 254 - bxhs lr - // We have actual overflow. Rebias r0 to bring the exponent back into range, - // which ensures its sign is correct. Then make an infinity of that sign to - // return. - subs r0, r0, #0xC0 << 23 - movs r12, #0xFF // exponent of infinity - orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 - movs r0, r12, lsl #23 // shift it up to the top of r0 to return - bx lr - -LOCAL_LABEL(uncommon): - // We come here from the start of the function if either input is an uncommon - // value: zero, denormal, infinity or NaN. - // - // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x - // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, - // because the instruction that set it up was conditional. 
So first we - // unconditionally repeat it. - and r3, r12, r1, lsr #7 - - // In all cases not involving a NaN as output, the sign of the output is made - // in the same way as for finite numbers, as the XOR of the input signs. So - // repeat the sign setup from the main branch. - teq r0, r1 // is the output sign bit 1? - orrmi r2, r2, #0x100 // if so, set bit 8 of r2 - - // Detect infinities and NaNs, by checking if either of r2 or r3 is at least - // 0xFF0000. - cmp r2, #0xFF0000 - cmplo r3, #0xFF0000 - bhs LOCAL_LABEL(inf_NaN) - - // Now we know there are no infinities or NaNs, but there's at least one zero - // or denormal. - movs r12, r1, lsl #1 // is y zero? - beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero - movs r12, r0, lsl #1 // is x zero? (now we know that y is not) - moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) - bxeq lr - - // Now we've eliminated zeroes as well, leaving only denormals: either x or - // y, or both, is a denormal. Call fnorm2 to convert both into a normalised - // mantissa and a (potentially small) exponent. - and r12, r2, #0x100 // save the result sign from r2 - lsr r2, #16 // shift extracted exponents down to bit 0 - lsr r3, #16 // where fnorm2 will expect them - push {r0, r1, r2, r3, r12, lr} - mov r0, sp // tell fnorm2 where to find its data - bl SYMBOL_NAME(__compiler_rt_fnorm2) - pop {r0, r1, r2, r3, r12, lr} - lsl r3, #16 // shift exponents back up to bit 16 - orr r2, r12, r2, lsl #16 // and put the result sign back in r2 - - // Now rejoin the main code path, having finished the setup it will expect: - // swap x and y, and shift the fractions back down to the low 24 bits. - mov r12, r0, lsr #8 - mov r0, r1, lsr #8 - mov r1, r12 - b LOCAL_LABEL(div) - -LOCAL_LABEL(inf_NaN): - // We come here if at least one input is a NaN or infinity. If either or both - // inputs are NaN then we hand off to fnan2 to propagate a NaN from the - // input. 
- mov r12, #0xFF000000 - cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN - blo SYMBOL_NAME(__compiler_rt_fnan2) - cmp r12, r1, lsl #1 - blo SYMBOL_NAME(__compiler_rt_fnan2) - - // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and - // finite/inf = 0. - - // If both operands are infinity, we return a NaN. Since we know at - // least _one_ is infinity, we can test this by checking if they're - // equal apart from the sign bits. - eor r3, r0, r1 - lsls r3, #1 // were all bits of XOR zero other than top? - beq LOCAL_LABEL(invalid) // if so, both operands are infinity - - // See if x is infinite - cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? - beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity - - // y is infinite and x is not, so we return a zero of the - // combined sign. - eor r0, r0, r1 // calculate the right sign - and r0, r0, #0x80000000 // throw away everything else - bx lr - -LOCAL_LABEL(divbyzero): - // Here, we know y is zero. But we don't know if x is zero or nonzero. So we - // might be calculating 0/0 (invalid operation, generating a NaN), or - // nonzero/0 (the IEEE "division by zero" exception, generating infinity). - movs r12, r0, lsl #1 // is x zero too? - beq LOCAL_LABEL(invalid) // if so, go and return a NaN - -LOCAL_LABEL(infret): - // Here, we're either dividing infinity by a finite number, or dividing a - // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all - // these cases we return infinity with the sign from r2. - // - // If we were implementing IEEE exceptions, we'd have to separate these - // cases: infinity / finite is not an _exception_, it just returns infinity, - // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here - // we're not implementing exceptions, so we can treat all three cases the - // same. 
- // - // r2 contains the output sign in bit 8, which is a convenient place to find - // it when making an infinity, because we can fill in the 8 exponent bits - // below that and then shift it left. - orr r2, r2, #0xff // sign + maximum exponent - lsl r0, r2, #23 // shift up to the top - bx lr - -LOCAL_LABEL(invalid): - // Return the default NaN, from an invalid operation (either dividing - // infinity by infinity, or 0 by 0). - ldr r0, =0x7FC00000 - bx lr - -// Finally, the lookup table for the initial reciprocal approximation. -// -// The table index is made from the top 7 bits of the denominator mantissa. But -// the topmost bit is always 1, so only the other 6 bits vary. So it only has -// 64 entries, not 128. -// -// Each table entry is a single byte, with its top bit set. So the table -// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by -// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. -// -// Each of these 64 entries corresponds to a large interval of possible -// mantissas. For example, if the top 7 bits are 1000001 then the overall -// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output -// of this table provides more bits than the input, there are several choices -// of 8-bit reciprocal approximation for a number in that interval. The -// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal -// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable -// choices for that table entry: F9, FA, FB or FC. Which do we pick? -// -// The table below is generated by choosing whichever value minimises the -// maximum possible error _after_ the approximation is improved by the -// Newton-Raphson step. In the example above, we end up with FA. -// -// The Python code below will regenerate the table, complete with the per-entry -// comments. 
- -/* - -for prefix in range(64, 128): - best = None - - # Max and min 23-bit mantissas with this 7-bit prefix - mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 - - # Max and min table entry corresponding to the reciprocal of something in - # that range of mantissas: round up the reciprocal of mmax, and round down - # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because - # 0x100 can't be used as a table entry due to not fitting in a byte, even - # though it's the exact reciprocal of the overall-smallest mantissa - # 0x800000. - gmin = max(128, (2**31 + mmin - 1) // mmax) - gmax = min(255, 2**31 // mmin) - - # For each of those table entries, compute the result of starting from that - # value and doing a Newton-Raphson iteration, with the mantissa at each end - # of the mantissa interval. One of these will be the worst possible error. - # Choose the table entry whose worst error is as small as possible. - # - # (To find the extreme values of a more general function on an interval, - # you must consider its values not only at the interval endpoints but also - # any turning points within the interval. Here, the function has only one - # turning point, and by construction it takes value 0 there, so we needn't - # worry.) 
- g = max( - range(gmin, gmax + 1), - key=lambda g: min( - (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] - ), - ) - - print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" - f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" - ) - -*/ - - .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb -LOCAL_LABEL(tab): - .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] - .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] - .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] - .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] - .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] - .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] - .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] - .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] - .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] - .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] - .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] - .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] - .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] - .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] - .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] - .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] - .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] - .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] - .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] - .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] - .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] - .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] - .byte 0xbd // input 
[0xac0000,0xadffff], candidate outputs [0xbd,0xbe] - .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] - .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] - .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] - .byte 0xb5 // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6] - .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] - .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] - .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] - .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] - .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] - .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] - .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] - .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] - .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] - .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] - .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] - .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] - .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] - .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] - .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] - .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] - .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] - .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] - .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] - .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] - .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] - .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] - .byte 0x90 // input [0xe20000,0xe3ffff], 
candidate outputs [0x90,0x90] - .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] - .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] - .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] - .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] - .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] - .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] - .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] - .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] - .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] - .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] - .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] - .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] - .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] - .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] - -END_COMPILERRT_FUNCTION(__divsf3) - -NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c deleted file mode 100644 index 06bbd4339f171..0000000000000 --- a/compiler-rt/lib/builtins/arm/fnan2.c +++ /dev/null @@ -1,42 +0,0 @@ -//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This helper function is available for use by single-precision float -// arithmetic implementations to handle propagating NaNs from the input -// operands to the output, in a way that matches Arm hardware FP. 
-// -// On input, a and b are floating-point numbers in IEEE 754 encoding, and at -// least one of them must be a NaN. The return value is the correct output NaN. -// -// A signalling NaN in the input (with bit 22 clear) takes priority over any -// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If -// both inputs are the same type of NaN then the first input takes priority: -// the input a is used instead of b. -// -//===----------------------------------------------------------------------===// - -#include - -uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { - // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at - // the bit position where the quiet vs signalling bit ended up. This squashes - // all the signalling NaNs to the top of the range of 32-bit values, from - // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values - // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect - // a signalling NaN by asking if it's greater than 0xff800000, and a quiet - // one by asking if it's less than 0x00800000. - uint32_t aadj = (a << 1) + 0x00800000; - uint32_t badj = (b << 1) + 0x00800000; - if (aadj > 0xff800000) // a is a signalling NaN? - return a | 0x00400000; // if so, return it with the quiet bit set - if (badj > 0xff800000) // b is a signalling NaN? - return b | 0x00400000; // if so, return it with the quiet bit set - if (aadj < 0x00800000) // a is a quiet NaN? - return a; // if so, return it - return b; // otherwise we expect b must be a quiet NaN -} diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c deleted file mode 100644 index 29eba1cbde59d..0000000000000 --- a/compiler-rt/lib/builtins/arm/fnorm2.c +++ /dev/null @@ -1,62 +0,0 @@ -//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This helper function is available for use by single-precision float -// arithmetic implementations, to handle denormal inputs on entry by -// renormalizing the mantissa and modifying the exponent to match. -// -//===----------------------------------------------------------------------===// - -#include - -// Structure containing the function's inputs and outputs. -// -// On entry: a, b are two input floating-point numbers, still in IEEE 754 -// encoding. expa and expb are the 8-bit exponents of those numbers, extracted -// and shifted down to the low 8 bits of the word, with no other change. -// Neither value should be zero, or have the maximum exponent (indicating an -// infinity or NaN). -// -// On exit: each of a and b contains the mantissa of the input value, with the -// leading 1 bit made explicit, and shifted up to the top of the word. If expa -// was zero (indicating that a was denormal) then it is now represented as a -// normalized number with an out-of-range exponent (zero or negative). The same -// applies to expb and b. -struct fnorm2 { - uint32_t a, b, expa, expb; -}; - -void __compiler_rt_fnorm2(struct fnorm2 *values) { - // Shift the mantissas of a and b to the right place to follow a leading 1 in - // the top bit, if there is one. - values->a <<= 8; - values->b <<= 8; - - // Test if a is denormal. - if (values->expa == 0) { - // If so, decide how much further up to shift its mantissa, and adjust its - // exponent to match. This brings the leading 1 of the denormal mantissa to - // the top of values->a. - uint32_t shift = __builtin_clz(values->a); - values->a <<= shift; - values->expa = 1 - shift; - } else { - // Otherwise, leave the mantissa of a in its current position, and OR in - // the explicit leading 1. 
- values->a |= 0x80000000; - } - - // Do the same operation on b. - if (values->expb == 0) { - uint32_t shift = __builtin_clz(values->b); - values->b <<= shift; - values->expb = 1 - shift; - } else { - values->b |= 0x80000000; - } -} diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c deleted file mode 100644 index fd29e157328a3..0000000000000 --- a/compiler-rt/lib/builtins/arm/funder.c +++ /dev/null @@ -1,78 +0,0 @@ -//===-- funder.c - Handle single-precision floating-point underflow -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This helper function is available for use by single-precision float -// arithmetic implementations to handle underflowed output values, if they were -// computed in the form of a normalized mantissa and an out-of-range exponent. -// -// On input: x should be a complete IEEE 754 floating-point value representing -// the desired output scaled up by 2^192 (the same value that would have been -// passed to an underflow trap handler in IEEE 754:1985). -// -// This isn't enough information to re-round to the correct output denormal -// without also knowing whether x itself has already been rounded, and which -// way. 'errsign' gives this information, by indicating the sign of the value -// (true result - x). That is, if errsign > 0 it means the true value was -// larger (x was rounded down); if errsign < 0 then x was rounded up; if -// errsign == 0 then x represents the _exact_ desired output value. 
-// -//===----------------------------------------------------------------------===// - -#include - -#define SIGNBIT 0x80000000 -#define MANTSIZE 23 -#define BIAS 0xc0 - -uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) { - uint32_t sign = x & SIGNBIT; - uint32_t exponent = (x << 1) >> 24; - - // Rule out exponents so small (or large!) that no denormalisation - // is needed. - if (exponent > BIAS) { - // Exponent 0xc1 or above means a normalised number got here by - // mistake, so we just remove the 0xc0 exponent bias and go - // straight home. - return x - (BIAS << MANTSIZE); - } - uint32_t bits_lost = BIAS + 1 - exponent; - if (bits_lost > MANTSIZE + 1) { - // The implicit leading 1 of the intermediate value's mantissa is - // below the lowest mantissa bit of a denormal by at least 2 bits. - // Round down to 0 unconditionally. - return sign; - } - - // Make the full mantissa (with leading bit) at the top of the word. - uint32_t mantissa = 0x80000000 | (x << 8); - // Adjust by 1 depending on the sign of the error. - mantissa -= errsign >> 31; - mantissa += (-errsign) >> 31; - - // Shift down to the output position, keeping the bits shifted off. - uint32_t outmant, shifted_off; - if (bits_lost == MANTSIZE + 1) { - // Special case for the exponent where we have to shift the whole - // of 'mantissa' off the bottom of the word. - outmant = 0; - shifted_off = mantissa; - } else { - outmant = mantissa >> (8 + bits_lost); - shifted_off = mantissa << (32 - (8 + bits_lost)); - } - - // Re-round. 
- if (shifted_off >> 31) { - outmant++; - if (!(shifted_off << 1)) - outmant &= ~1; // halfway case: round to even - } - - return sign | outmant; -} diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S deleted file mode 100644 index b4f4c5e958c52..0000000000000 --- a/compiler-rt/lib/builtins/arm/mulsf3.S +++ /dev/null @@ -1,309 +0,0 @@ -//===-- mulsf3.S - single-precision floating point multiplication ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements single-precision soft-float multiplication with the -// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 -// assembly language suitable to be built as either Arm or Thumb2. -// -//===----------------------------------------------------------------------===// - -#include "../assembly.h" - - - .syntax unified - .text - .p2align 2 - -DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) - -DEFINE_COMPILERRT_FUNCTION(__mulsf3) - - // Check if either input exponent is 00 or FF (i.e. not a normalized number), - // and if so, branch out of line. If we don't branch out of line, then we've - // also extracted the exponents of the input values r0/r1 into bits 16..23 of - // r2/r3. But if we do, then that hasn't necessarily been done (because the - // second AND might have been skipped). 
- mov r12, #0xFF0000 - ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 - andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 - teqne r2, r12 // otherwise, sets Z if exponent of x is FF - teqne r3, r12 // otherwise, sets Z if exponent of y is FF - beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm - - // Calculate the sign of the result, and put it in an unused bit of r2. - teq r0, r1 // sets N to the XOR of x and y's sign bits - orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 - - // Move the input mantissas to the high end of r0/r1, each with its leading - // bit set explicitly, so that they're in the right form to be multiplied. - mov r12, #0x80000000 - orr r0, r12, r0, lsl #8 - orr r1, r12, r1, lsl #8 - - // Now we're ready to multiply mantissas. This is also the place we'll come - // back to after decoding denormal inputs. The denormal decoding will also - // have to set up the same register contents: - // - decoded fractions at the top of r0 and r1 - // - exponents in r2 and r3, starting at bit 16 - // - output sign in r2 bit 8 -LOCAL_LABEL(mul): - - // Here we multiply the mantissas, and compute the output exponent by adding - // the input exponents and rebiasing. These operations are interleaved to - // use a delay slot. - // - // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd - // expect. That compensates for the leading bit of the mantissa overlapping - // it, when we recombine the exponent and mantissa by addition. - add r2, r2, r3 // r2 has sum of exponents, freeing up r3 - umull r1, r3, r0, r1 // r3:r1 has the double-width product - sub r2, r2, #(0x80 << 16) // rebias the summed exponent - - // Compress the double-word product into just the high-order word r3, by - // setting its bit 0 if any bit of the low-order word is nonzero. 
This - // changes the represented value, but not by nearly enough to affect - // rounding, because rounding only depends on the bit below the last output - // bit, and the general question of whether _any_ nonzero bit exists below - // that. - cmp r1, #0 // if low word of full product is nonzero - orrne r3, r3, #1 // then set LSB of high word - - // The two inputs to UMULL had their high bits set, that is, were at least - // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. - // the high bit of the product could be at the top of the word or one bit - // below. Check which, by experimentally shifting left, and then undoing it - // via RRX if we turned out to have shifted off a 1 bit. - lsls r3, r3, #1 // shift left, setting C to the bit shifted off - rrxcs r3, r3 // if that bit was 1, put it back again - - // That ensured the leading 1 bit of the product is now the top of r3, but - // also, set C if the leading 1 was _already_ in the top bit. So now we know - // whether to increment the exponent. The following instruction does the - // conditional increment (because it's ADC), but also, copies the exponent - // field from bit 16 of r2 into bit 0, so as to place it just below the - // output sign bit. - // - // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 - // are exactly what we need to combine with the rounded mantissa. But the - // full output exponent (with extra bits) is still available in the high half - // of r2, so that we can check _whether_ we overflowed or underflowed. - adc r2, r2, r2, asr #16 - - // Recombine the exponent and mantissa, doing most of the rounding as a side - // effect: we shift the mantissa right so as to put the round bit into C, and - // then we recombine with the exponent using ADC, to increment the mantissa - // if C was set. 
- movs r12, r3, lsr #8 - adc r0, r12, r2, lsl #23 - - // To complete the rounding, we must check for the round-to-even tiebreaking - // case, by checking if we're in the exact halfway case, which occurs if and - // only if we _did_ round up (we can tell this because C is still set from - // the MOVS), and also, no bit of r3 is set _below_ the round bit. - // - // We combine this with an overflow check, so that C ends up set if anything - // weird happened, and clear if we're completely finished and can return. - // - // The best instruction sequence for this part varies between Arm and Thumb. -#if !__thumb__ - // Arm state: if C was set then we check the low bits of r3, so that Z ends - // up set if we need to round to even. - // - // (We rely here on Z reliably being clear to begin with, because shifting - // down the output mantissa definitely gave a nonzero output. Also, the TST - // doesn't change C, so if Z does end up set, then C was also set.) - // - // Then, if we're not rounding to even, we do a CMP which sets C if there's - // been an overflow or an underflow. An overflow could occur for an output - // exponent as low as 0xFC, because we might increment the exponent by 1 when - // renormalizing, by another when recombining with the mantissa, and by one - // more if rounding up causes a carry off the top of the mantissa. An - // underflow occurs only if the output exponent is negative (because it's - // offset by 1, so an exponent of 0 will be incremented to 1), in which case - // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to - // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also - // catches a few very large cases that _don't_ quite overflow (exponents of - // 0xFC and above that don't get maximally unlucky); those will also be - // handled by the slow path. 
- tstcs r3, #0x7F - cmpne r2, #0xFC0000 -#else - // In Thumb, switching between different conditions has a higher cost due to - // the (implicit in this code) IT instructions, so we prefer a strategy that - // uses CC and CS conditions throughout, at the cost of requiring some extra - // cleanup instructions on the slow path. - // - // If C is set (and hence round-to-even is a possibility), the basic idea is - // to shift the full result word (r3) left by 25, leaving only its bottom 7 - // bits, which are now the top 7 bits; then we want to set C iff these are 0. - // - // The "CMP x,y" instruction sets C if y > x (as unsigned integers). So this - // could be done in one instruction if only we had a register to use as x, - // which has 0 in the top 7 bits and at least one nonzero. Then we could - // compare that against the shifted-up value of r3, setting C precisely if - // the top 7 bits of y are greater than 0. And happily, we _do_ have such a - // register! r12 contains the shifted-down mantissa, which is guaranteed to - // have a 1 in bit 23, and 0 above that. - // - // The shift of r3 happens only in the second operand of the compare, so we - // don't lose the original value of r3 in this process. - // - // The check for over/underflow is exactly as in the Arm branch above, except - // based on a different condition. - cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even - cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed -#endif - - // That's all the checks for difficult cases done. If C is clear, we can - // return. - bxcc lr - - // Now the slower path begins. We have to recover enough information to - // handle all of round-to-even, overflow and underflow. - // - // Round to even is the most likely of these, so we detect it first and - // handle it as fast as possible. - -#if __thumb__ - // First, Thumb-specific compensation code. 
The Arm branch of the #if above - // will have set Z=0 to indicate round to even, but the Thumb branch didn't - // leave any unambiguous indicator of RTE, so we must retest by checking all - // the bits shifted off the bottom of the mantissa to see if they're exactly - // the half-way value. - lsl r12, r3, #24 // r12 = round bit and everything below - cmp r12, #0x80000000 // set Z if that is exactly 0x80000000 -#endif - - // Now Z is clear iff we have already rounded up and now must replace that - // with rounding to even, which is done by just clearing the low bit of the - // mantissa. - biceq r0, r0, #1 - - // Redo the over/underflow check (the same way as in both branches above), - // and if it doesn't report a danger, we can return the rounded-to-even - // answer. - cmp r2, #0xFC0000 // check for over/underflow - bxcc lr // and return if none. - - // Now we only have overflow and underflow left to handle. First, find out - // which we're looking at. This is easy by testing the top bit of r2, but - // even easier by using the fact that the possible positive and negative - // values of r2 are widely enough separated that the 0xFC0000 subtracted by - // the CMP above won't have made any difference. So the N flag output from - // that comparison _already_ tells us which condition we have: if N is set we - // have underflow, and if N is clear, overflow. - bpl LOCAL_LABEL(overflow) - - // Here we're handling underflow. - - // Add the IEEE 754:1985 exponent bias which funder will expect. This also - // brings the exponent back into a range where it can't possibly have carried - // into the sign bit, so the output sign will now be right. - add r0, r0, #(0xC0 << 23) - - // Determine whether we rounded up, down or not at all. - lsls r2, r3, #1 // input mantissa, without its leading 1 - subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) - - // And let funder handle the rest. 
- b SYMBOL_NAME(__compiler_rt_funder) - -LOCAL_LABEL(overflow): - // We come here to handle overflow, but it's not guaranteed that an overflow - // has actually happened: our check on the fast path erred on the side of - // caution, by catching any output exponent that _could_ cause an overflow. - // So first check whether this really is an overflow, by extracting the - // output exponent. Exponent 0xFF, or anything that wrapped round to having - // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. - // - // The value in r0 is correct to return, if there's no overflow. - add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 - movs r12, r12, lsl #1 // test the top bit of the modified value - bxmi lr // if top bit is still 1, not an overflow - - // This is an overflow, so we need to replace it with an appropriately signed - // infinity. First we correct the sign by applying a downward bias to the - // exponent (the one suggested in IEEE 754:1985, which was chosen to bring - // all possible overflowed results back into range). - subs r0, r0, #(0xC0 << 23) - - // Now the sign bit of r0 is correct. Replace everything else with the - // encoding of an infinity. - mov r1, #0xFF - and r0, r0, #0x80000000 - orr r0, r0, r1, lsl #23 - bx lr - -LOCAL_LABEL(uncommon): - // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that - // we've at least done the first _two_ instructions from the entry point, - // even if all the rest were skipped. So r2 contains the sign and exponent of - // x in bits 16..23, and r12 = 0xFF << 16. - // - // So, first repeat some instructions from the prologue, which were either - // conditionally skipped in the sequence leading to the branch, or skipped - // because they happened after the branch. 
- and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 - teq r0, r1 // calculate the sign of the result - orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before - - // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at - // least 0xFF0000 (hence the exponent field is equal to 0xFF). - cmp r2, r12 - cmplo r3, r12 - bhs LOCAL_LABEL(inf_NaN) - - // If we didn't take that branch, then we have only finite numbers, but at - // least one is denormal or zero. A zero makes the result easy (and also is a - // more likely input than a denormal), so check those first, as fast as - // possible. - movs r12, r0, lsl #1 // Z set if x == 0 - movsne r12, r1, lsl #1 // now Z set if either input is 0 - moveq r0, r2, lsl #23 // in either case, make 0 of the output sign - bxeq lr // and return it - - // Now we know we only have denormals to deal with. Call fnorm2 to sort - // them out, and rejoin the main code path above. - and r12, r2, #0x100 // save the result sign from r2 - lsr r2, #16 // shift extracted exponents down to bit 0 - lsr r3, #16 // where fnorm2 will expect them - push {r0, r1, r2, r3, r12, lr} - mov r0, sp // tell fnorm2 where to find its data - bl SYMBOL_NAME(__compiler_rt_fnorm2) - pop {r0, r1, r2, r3, r12, lr} - lsl r3, #16 // shift exponents back up to bit 16 - orr r2, r12, r2, lsl #16 // and put the result sign back in r2 - b LOCAL_LABEL(mul) - -LOCAL_LABEL(inf_NaN): - // We come here if at least one input is a NaN or infinity. If either or both - // inputs are NaN then we hand off to fnan2 which will propagate a NaN from - // the input; otherwise any multiplication involving infinity returns - // infinity, unless it's infinity * 0 which is an invalid operation and - // returns NaN again. 
- mov r12, #0xFF000000 - cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN - blo SYMBOL_NAME(__compiler_rt_fnan2) - cmp r12, r1, lsl #1 - blo SYMBOL_NAME(__compiler_rt_fnan2) - - // NaNs are dealt with, so now we have at least one infinity. Check if the - // other operand is 0. This is conveniently done by XORing the two: because - // we know that the low 31 bits of one operand are exactly 0x7F800000, we can - // test if the low 31 bits of the other one are all 0 by checking whether the - // low 31 bits of (x XOR y) equal 0x7F800000. - eor r3, r0, r1 - cmp r12, r3, lsl #1 // if inf * 0, this sets Z - lsr r0, r12, #1 // set up return value of +infinity - orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign - orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit - bx lr // and return - -END_COMPILERRT_FUNCTION(__mulsf3) - -NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S deleted file mode 100644 index f2ede1013a9e6..0000000000000 --- a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S +++ /dev/null @@ -1,251 +0,0 @@ -//===-- mulsf3.S - single-precision floating point multiplication ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements single-precision soft-float multiplication with the -// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 -// assembly language. 
-// -//===----------------------------------------------------------------------===// - -#include "../../assembly.h" - - .syntax unified - .text - .thumb - .p2align 2 - -DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) - -DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) - push {r4,r5,r6,lr} - - // Get exponents of the inputs, and check for uncommon values. In the process - // of this we also compute the sign, because it's marginally quicker that - // way. - lsls r2, r0, #1 - adcs r4, r4, r4 // set r4[0] to sign bit of x - lsls r3, r1, #1 - adcs r4, r4, r3 // set r4[0] to the output sign - lsrs r2, r2, #24 - beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS - lsrs r3, r3, #24 - beq LOCAL_LABEL(zerodenorm) - cmp r2, #255 - beq LOCAL_LABEL(naninf) - cmp r3, #255 - beq LOCAL_LABEL(naninf) - // Compute the output exponent. We'll be generating our product _without_ the - // leading bit, so we subtract 0x7f rather than 0x80. - adds r2, r2, r3 - subs r2, r2, #0x7f - // Blank off everything above the mantissas. - lsls r0, r0, #9 - lsls r1, r1, #9 -LOCAL_LABEL(normalised): // we may come back here from zerodenorm - lsrs r0, r0, #9 - lsrs r1, r1, #9 - // Multiply. r0 and r1 are the mantissas of the inputs but without their - // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). - // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. - // - // The technique below will actually compute P - 2^46, by not adding on the - // term where the two 2^23 are multiplied. The 48-bit result will be - // delivered in two output registers, one containing its bottom 32 bits and - // the other containing the top 32, so they overlap in the middle 16 bits. - // This is done using only two multiply instructions and some bookkeeping. - // - // In the comments I'll write X and Y for the original input mantissas (again - // without their leading bits). 
I'll also decompose them as X = xh + xl and - // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are - // multiples of 2^8. - adds r5, r0, r1 - lsls r5, r5, #7 // r5 = (X+Y) << 7 - movs r6, r0 - muls r6, r1, r6 // r6 is congruent mod 2^32 to X*Y - lsrs r0, r0, #8 - lsrs r1, r1, #8 - muls r0, r1, r0 - lsls r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh - subs r3, r6, r1 // now r3 is congruent mod 2^32 to - // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl - // and hence, since that is at most 0xfeff0001, - // is _exactly_ equal to that - adds r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 - lsrs r1, r3, #16 // r1 is the top 16 bits of r3, i.e. - // (xh*yl + xl*yh + xl*yl) >> 16 - adds r3, r0, r1 // now r3 equals - // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 - // i.e. (X*Y + (X+Y)<<23) >> 16, - // i.e. (the right answer) >> 16. - // Meanwhile, r6 is exactly the bottom 32 bits of the - // right answer. - // Renormalise if necessary. - lsrs r1, r3, #30 - beq LOCAL_LABEL(norenorm) - // Here we have to do something fiddly. Renormalisation would be a trivial - // job if we had the leading mantissa bit - just note that it's one bit - // position above where it should be, and shift right by one. But without - // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just - // shifting right would of course give us (x - 2^29), so we must subtract an - // extra 2^29 to fix this up. - lsrs r3, r3, #1 - movs r1, #1 - lsls r1, r1, #29 - subs r3, r3, r1 - adds r2, r2, #1 -LOCAL_LABEL(norenorm): - // Round and shift down to the right bit position. - lsrs r0, r3, #7 // round bit goes into the carry flag - bcc LOCAL_LABEL(rounded) - adds r0, r0, #1 - // In the round-up branch, we must also check if we have to round to even, by - // testing all the bits below the round bit. We will normally not expect to, - // so we do RTE by branching out of line and back again to avoid spending a - // branch in the common case. 
- lsls r5, r3, #32-7+1 // check the bits shifted out of r3 above - bne LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even - lsls r5, r6, #15 // check the bottom 17 bits of the low-order 32 - // (enough to overlap r3 even if we renormalised) - beq LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE -LOCAL_LABEL(rounded): - // Put on the sign and exponent, check for underflow and overflow, and - // return. - // - // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if - // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since - // this code doesn't report exceptions, we can ignore this case because it'll - // happen to return the right answer regardless). So we handle most of this - // via an unsigned comparison against 0xFF, which leaves the one case of a - // zero exponent that we have to filter separately by testing the Z flag - // after we shift the exponent back up into place. - cmp r2, #0xFF // check for most over/underflows - bhs LOCAL_LABEL(outflow) // ... and branch out of line for them - lsls r5, r2, #23 // shift the exponent into its output location - beq LOCAL_LABEL(outflow) // ... and branch again if it was 0 - lsls r4, r4, #31 // shift the output sign into place - orrs r0, r0, r4 // and OR it in to the output - adds r0, r0, r5 // OR in the mantissa - pop {r4,r5,r6,pc} // and return - -LOCAL_LABEL(rte): - // Out-of-line handler for the round-to-even case. Clear the low mantissa bit - // and go back to the post-rounding code. - movs r5, #1 - bics r0, r0, r5 - b LOCAL_LABEL(rounded) - -LOCAL_LABEL(outflow): - cmp r2, #0 - bgt LOCAL_LABEL(overflow) - // To handle underflow, we construct an intermediate value in the IEEE 754 - // style (using our existing full-length mantissa, and bias the exponent by - // +0xC0), and indicate whether that intermediate was rounded up, down or not - // at all. Then call the helper function funder, which will denormalise and - // re-round correctly. 
- lsls r1, r0, #7 // shift up the post-rounding mantissa - subs r1, r3, r1 // and subtract it from the pre-rounding version - lsls r6, r6, #15 - cmp r6, #1 // if the rest of the low bits are nonzero - adcs r1, r1, r1 // then set an extra bit at the bottom - - lsls r4, r4, #31 - orrs r0, r0, r4 // put on the sign - adds r2, r2, #192 // bias the exponent - lsls r3, r2, #23 - adds r0, r0, r3 // put on the biased exponent - - bl SYMBOL_NAME(__compiler_rt_funder) - pop {r4,r5,r6,pc} - -LOCAL_LABEL(overflow): - // Handle overflow by returning an infinity of the correct sign. - lsls r4, r4, #8 // move the sign up to bit 8 - movs r0, #0xff - orrs r0, r0, r4 // fill in an exponent just below it - lsls r0, r0, #23 // and shift those 9 bits up to the top of the word - pop {r4,r5,r6,pc} - - // We come here if there's at least one zero or denormal. On the fast path - // above, it was convenient to check these before checking NaNs and - // infinities, but NaNs take precedence, so now we're off the fast path, we - // must still check for those. - // - // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input - // exponents. So if we branched after shifting-and-checking r2, we come to - // this earlier entry point 'zerodenorm0' so that we still shift r3. -LOCAL_LABEL(zerodenorm0): - lsrs r3, r3, #24 -LOCAL_LABEL(zerodenorm): - cmp r2, #255 - beq LOCAL_LABEL(naninf) - cmp r3, #255 - beq LOCAL_LABEL(naninf) - // Now we know we have at least one zero or denormal, and no NaN or infinity. - // Check if either input is actually zero. We've ruled out 0 * infinity by - // this point, so any zero input means we return zero of the correct sign. - lsls r6, r0, #1 // is one input zero? - beq LOCAL_LABEL(zero) // yes, go and return zero - lsls r6, r1, #1 // is the other one zero? 
- bne LOCAL_LABEL(denorm) // if not, one must have been a denormal -LOCAL_LABEL(zero): - lsls r0, r4, #31 // shift up the output sign to make the return value - pop {r4,r5,r6,pc} - - // Handle denormals via the helper function fnorm2, which will break both - // inputs up into mantissa and exponent, renormalising and generating a - // negative exponent if necessary. -LOCAL_LABEL(denorm): - push {r0,r1,r2,r3} - mov r0, sp - bl SYMBOL_NAME(__compiler_rt_fnorm2) - pop {r0,r1,r2,r3} - // Convert fnorm2's return values into the right form to rejoin the main - // code path. - lsls r0, r0, #1 - lsls r1, r1, #1 - adds r2, r2, r3 - subs r2, r2, #0x7f - b LOCAL_LABEL(normalised) - - // We come here if at least one input is a NaN or infinity. There may still - // be zeroes (or denormals, though they make no difference at this stage). -LOCAL_LABEL(naninf): - movs r6, #0xff - lsls r6, r6, #24 - lsls r5, r0, #1 - cmp r5, r6 - bhi LOCAL_LABEL(nan) // first operand is a NaN - lsls r5, r1, #1 - cmp r5, r6 - bhi LOCAL_LABEL(nan) // second operand is a NaN - - // We know we have at least one infinity, and no NaNs. We might also have a - // zero, in which case we return the default quiet NaN. - lsls r6, r0, #1 - beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf - lsls r6, r1, #1 - beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf - // Otherwise we have infinity * infinity, or infinity * finite. Just return - // an appropriately signed infinity. - b LOCAL_LABEL(overflow) // reuse the code there - - // We come here if at least one input is a NaN. Hand off to fnan2, which - // propagates an appropriate NaN to the output, dealing with the special - // cases of signalling/quiet NaNs. -LOCAL_LABEL(nan): - bl SYMBOL_NAME(__compiler_rt_fnan2) - pop {r4,r5,r6,pc} - - // Return a quiet NaN as the result of infinity * zero. 
-LOCAL_LABEL(infzero): - ldr r0, =0x7fc00000 - pop {r4,r5,r6,pc} - -END_COMPILERRT_FUNCTION(__mulsf3) - -NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 8e3cb35183ba7..63f4c94605c90 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -35,10 +35,6 @@ if(APPLE) darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH) endif() -if(COMPILER_RT_ARM_OPTIMIZED_FP) - list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP) -endif() - foreach(arch ${BUILTIN_TEST_ARCH}) set(BUILTINS_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX) diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index 12c5df5fdaae1..f8cb6169ac283 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,428 +1,115 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - // RUN: %clang_builtins %s %librt -o %t && %run %t // REQUIRES: librt_has_divsf3 #include "int_lib.h" -#include #include #include "fp_test.h" -// By default this test uses compareResultF to check the returned floats, which -// accepts any returned NaN if the expected result is the canonical NaN value -// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more -// detailed handling of NaNs, we tighten up the check and include some extra -// test cases specific to that NaN policy. 
-#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP -# define EXPECT_EXACT_RESULTS -# define ARM_NAN_HANDLING -#endif - // Returns: a / b COMPILER_RT_ABI float __divsf3(float a, float b); -int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { - float a = fromRep32(a_rep), b = fromRep32(b_rep); - float x = __divsf3(a, b); -#ifdef EXPECT_EXACT_RESULTS - int ret = toRep32(x) == expected_rep; -#else - int ret = compareResultF(x, expected_rep); -#endif +int test__divsf3(float a, float b, uint32_t expected) +{ + float x = __divsf3(a, b); + int ret = compareResultF(x, expected); - if (ret) { - printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 - ", expected %08" PRIx32 "\n", - a_rep, b_rep, toRep32(x), expected_rep); - } - return ret; + if (ret){ + printf("error in test__divsf3(%.20e, %.20e) = %.20e, " + "expected %.20e\n", a, b, x, + fromRep32(expected)); + } + return ret; } -int main(void) { - int status = 0; +int main() +{ + // Returned NaNs are assumed to be qNaN by default + + // qNaN / any = qNaN + if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) + return 1; + // NaN / any = NaN + if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) + return 1; + // any / qNaN = qNaN + if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) + return 1; + // any / NaN = NaN + if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) + return 1; + + // +Inf / positive = +Inf + if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) + return 1; + // +Inf / negative = -Inf + if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) + return 1; + // -Inf / positive = -Inf + if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) + return 1; + // -Inf / negative = +Inf + if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) + return 1; + + // Inf / Inf = NaN + if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) + return 1; + // 0.0 / 
0.0 = NaN + if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) + return 1; + // +0.0 / +Inf = +0.0 + if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) + return 1; + // +Inf / +0.0 = +Inf + if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + + // positive / +0.0 = +Inf + if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + // positive / -0.0 = -Inf + if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) + return 1; + // negative / +0.0 = -Inf + if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) + return 1; + // negative / -0.0 = +Inf + if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) + return 1; + + // 1/3 + if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) + return 1; + // smallest normal result + if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) + return 1; - status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); - status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); - status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); - status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); - status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); - status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); - status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); - status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); - status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); - status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); - status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); - status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); - status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); - status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); - status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000); - status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); - status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); - status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); - status |= 
test__divsf3(0x00000001, 0x00000000, 0x7f800000); - status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); - status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); - status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); - status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); - status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); - status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); - status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); - status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); - status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); - status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); - status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); - status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); - status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); - status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); - status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); - status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); - status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); - status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); - status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); - status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); - status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); - status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); - status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); - status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff); - status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); - status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); - status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); - status |= test__divsf3(0x00800001, 0xff800000, 0x80000000); - status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); - status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); - status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); - status |= 
test__divsf3(0x00ffffff, 0x40000000, 0x00800000); - status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); - status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); - status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); - status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); - status |= test__divsf3(0x01000000, 0xff800000, 0x80000000); - status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); - status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); - status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); - status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); - status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); - status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); - status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); - status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); - status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); - status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); - status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); - status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); - status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); - status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); - status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); - status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); - status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); - status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); - status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); - status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); - status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); - status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); - status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); - status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); - status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); - status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); - status |= 
test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); - status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); - status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); - status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); - status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); - status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); - status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); - status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); - status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); - status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); - status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); - status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); - status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); - status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); - status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); - status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); - status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); - status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); - status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); - status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); - status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); - status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); - status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); - status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); - status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); - status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); - status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); - status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); - status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); - status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); - status |= test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); - status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); - status |= 
test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); - status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); - status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); - status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); - status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); - status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); - status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); - status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); - status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); - status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); - status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); - status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); - status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); - status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); - status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); - status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); - status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); - status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); - status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); - status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); - status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); - status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); - status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); - status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); - status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); - status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); - status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); - status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); - status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); - status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); - status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); - status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); - status |= 
test__divsf3(0x7f000000, 0x00000000, 0x7f800000); - status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); - status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); - status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); - status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); - status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); - status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); - status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); - status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); - status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); - status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); - status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); - status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); - status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); - status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); - status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); - status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); - status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); - status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); - status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); - status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000); - status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); - status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); - status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); - status |= 
test__divsf3(0x7f800000, 0x81000000, 0xff800000); - status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); - status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); - status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); - status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); - status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); - status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); - status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); - status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); - status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); - status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); - status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); - status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); - status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); - status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); - status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); - status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); - status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); - status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); - status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); - status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); - status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); - status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); - status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); - status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); - status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); - status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); - status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); - status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000); - status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); - status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); - status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); - status |= 
test__divsf3(0x80000004, 0xff800000, 0x00000000); - status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); - status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); - status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000); - status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); - status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); - status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); - status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); - status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); - status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); - status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); - status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); - status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); - status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); - status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); - status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); - status |= test__divsf3(0x81000005, 0x00800005, 0xc0000000); - status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); - status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); - status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); - status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); - status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); - status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); - status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); - status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); - status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); - status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); - status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); - status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); - status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); - status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); - status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); - status |= 
test__divsf3(0xc1100000, 0xc0400000, 0x40400000); - status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); - status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); - status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); - status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); - status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); - status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); - status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); - status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); - status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); - status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); - status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); - status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); - status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); - status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); - status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); - status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); - status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); - status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); - status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); - status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); - status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); - status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); - status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); - status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); - status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); - status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); - status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); - status |= test__divsf3(0xff800000, 0x80800000, 0x7f800000); - status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); - status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); - status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); - status |= 
test__divsf3(0xff800000, 0xfe800000, 0x7f800000); - status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); - status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); - status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); + // divisor is exactly 1.0 + if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) + return 1; + // divisor is truncated to exactly 1.0 in UQ1.15 + if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) + return 1; - // Test that the result of an operation is a NaN at all when it should be. - // - // In most configurations these tests' results are checked compared using - // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, - // which causes compareResultF to accept any NaN encoding. We also use the - // same value as the input NaN in tests that have one, so that even in - // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is - // still the exact expected NaN. - status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); - status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); - status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); - status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); - status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); - status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); - status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); - status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); - status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000); - status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000); - status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + // smallest normal value divided by 2.0 + if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) + return 1; + // smallest subnormal result + if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) + return 1; -#ifdef ARM_NAN_HANDLING - // Tests specific to the NaN handling of Arm hardware, mimicked by - // arm/divsf3.S: - // - // 
- a quiet NaN is distinguished by the top mantissa bit being 1 - // - // - if a signalling NaN appears in the input, the output quiet NaN is - // obtained by setting its top mantissa bit and leaving everything else - // unchanged - // - // - if both operands are signalling NaNs then the output NaN is derived - // from the first operand - // - // - if both operands are quiet NaNs then the output NaN is the first - // operand - // - // - invalid operations not involving an input NaN return the quiet - // NaN with fewest bits set, 0x7fc00000. + // some misc test cases obtained by fuzzing against h/w implementation + if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006))) + return 1; + if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000))) + return 1; + if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000))) + return 1; - status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); - status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); - status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); - status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); - status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); - status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); - status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); - status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); - status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); - status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); - status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); - status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); - status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); - status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); - status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); - status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); - status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); - status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); - status 
|= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); - status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); - status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); - status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); - status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); - status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); - status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); - status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); - status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); - status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); - status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); - status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); - status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); - status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); - status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); - status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); - status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); - status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); - status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); - status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); - status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); - status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); - status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); - status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); - status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); - status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); - status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); - status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); - status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); - status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); - status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); - status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); - status |= 
test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); - status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); - status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); - status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); - status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); - status |= test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); - status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); - status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); - status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); - status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); -#endif // ARM_NAN_HANDLING + // test 1 / (1 - eps(0.5)) = 1 + eps(1) + if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) + return 1; - return status; + return 0; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c deleted file mode 100644 index 7dc7c8ad39c32..0000000000000 --- a/compiler-rt/test/builtins/Unit/mulsf3_test.c +++ /dev/null @@ -1,616 +0,0 @@ -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -// RUN: %clang_builtins %s %librt -o %t && %run %t -// REQUIRES: librt_has_mulsf3 - -#include "int_lib.h" -#include -#include - -#include "fp_test.h" - -// By default this test uses compareResultF to check the returned floats, which -// accepts any returned NaN if the expected result is the canonical NaN value -// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more -// detailed handling of NaNs, we tighten up the check and include some extra -// test cases specific to that NaN policy. 
-#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP -# define EXPECT_EXACT_RESULTS -# define ARM_NAN_HANDLING -#endif - -// Returns: a * b -COMPILER_RT_ABI float __mulsf3(float a, float b); - -int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { - float a = fromRep32(a_rep), b = fromRep32(b_rep); - float x = __mulsf3(a, b); -#ifdef EXPECT_EXACT_RESULTS - int ret = toRep32(x) == expected_rep; -#else - int ret = compareResultF(x, expected_rep); -#endif - - if (ret) { - printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 - ", expected %08" PRIx32 "\n", - a_rep, b_rep, toRep32(x), expected_rep); - } - return ret; -} - -int main(void) { - int status = 0; - - status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); - status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000); - status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000); - status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000); - status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000); - status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000); - status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000); - status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000); - status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000); - status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000); - status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000); - status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000); - status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000); - status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000); - status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000); - status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000); - status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000); - status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001); - status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001); - status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002); - status |= 
test__mulsf3(0x00000001, 0x7f800000, 0x7f800000); - status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001); - status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003); - status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003); - status |= test__mulsf3(0x00000008, 0x3e000000, 0x00000001); - status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); - status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); - status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); - status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); - status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); - status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); - status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); - status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); - status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); - status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); - status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); - status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); - status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); - status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); - status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); - status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); - status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); - status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); - status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); - status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); - status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001); - status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); - status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); - status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); - status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); - status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); - status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); - status |= 
test__mulsf3(0x3f000000, 0x80000001, 0x80000000); - status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); - status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); - status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); - status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); - status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); - status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); - status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); - status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); - status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); - status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); - status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); - status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); - status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); - status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); - status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); - status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); - status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); - status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); - status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); - status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); - status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); - status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); - status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); - status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); - status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); - status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); - status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); - status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); - status |= test__mulsf3(0x40400000, 0x00000002, 0x00000006); - status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); - status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); - status |= 
test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); - status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); - status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); - status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); - status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); - status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); - status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); - status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); - status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); - status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); - status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); - status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); - status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); - status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); - status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); - status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); - status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); - status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); - status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); - status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); - status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); - status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); - status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); - status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); - status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); - status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); - status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); - status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); - status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); - status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); - status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); - status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); - status |= 
test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); - status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); - status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); - status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); - status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); - status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); - status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); - status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); - status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); - status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); - status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); - status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); - status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); - status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); - status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); - status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); - status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); - status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); - status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000); - status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); - status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); - status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); - status |= test__mulsf3(0x80000001, 0x40a00000, 0x80000005); - status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); - status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); - status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); - status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); - status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); - status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); - status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); - status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); - status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); - status |= 
test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); - status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); - status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); - status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); - status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); - status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); - status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); - status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); - status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); - status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); - status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); - status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); - status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); - status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); - status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); - status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); - status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); - status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); - status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); - status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); - status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); - status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); - status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); - status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); - status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); - status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); - status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); - status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); - status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); - status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); - status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); - status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); - status |= 
test__mulsf3(0xc0000000, 0x00800001, 0x81000001); - status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); - status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); - status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); - status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); - status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); - status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); - status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); - status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); - status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); - status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); - status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); - status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); - status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); - status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); - status |= test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); - status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); - status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); - status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); - status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); - status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); - status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); - status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); - status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); - status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); - status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); - status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); - status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); - status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); - status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); - status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); - status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); - status |= 
test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); - status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); - status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); - status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); - status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); - status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); - status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); - status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); - status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); - status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); - status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); - status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); - status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); - status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); - status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); - status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); - status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); - status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); - status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); - status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); - status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); - status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); - status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); - status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); - status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); - status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); - status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); - status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); - status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); - status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); - status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); - status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); - status |= 
test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); - status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); - status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); - status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); - status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); - status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); - status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); - status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); - status |= test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); - status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); - status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); - status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); - status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); - status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); - status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); - status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); - status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); - status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); - status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); - status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); - status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); - status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); - status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); - status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); - status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); - status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); - status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); - status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); - status |= 
test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); - status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); - status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); - status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); - status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); - status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); - status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); - status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); - status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); - status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); - status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); - status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); - status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); - status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); - status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); - status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); - status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); - status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); - status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); - status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); - status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); - status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); - status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); - status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); - status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); - status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); - status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); - status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); - status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); - status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); - status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); - status |= 
test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); - status |= test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); - status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); - status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); - status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); - status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); - status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); - status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); - status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); - status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); - status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); - status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); - status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); - status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); - status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); - status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); - status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); - status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); - status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); - status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); - status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); - status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); - status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); - status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); - status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); - status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); - status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); - status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); - status |= 
test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); - status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); - status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); - status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); - status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); - status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); - status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); - status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); - status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); - status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); - status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); - status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); - status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); - status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); - status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); - status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); - status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); - status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); - status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); - status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); - status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); - status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); - status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); - status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); - status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); - status |= test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); - status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); - status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); - status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); - status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); - status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); - status |= 
test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); - status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); - status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); - status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); - status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); - status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); - status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); - status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); - status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); - status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); - status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); - status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); - status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); - status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); - status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); - status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); - status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); - status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); - status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); - status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); - status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); - status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); - status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); - status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); - status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); - status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); - status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); - status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); - status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); - status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); - status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); - status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); - status |= 
test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); - status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); - status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); - status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); - status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); - status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); - status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); - status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); - status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); - status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); - status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); - status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); - status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); - status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); - status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); - status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); - status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); - status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); - status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); - status |= test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); - status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); - status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); - status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); - status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); - status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); - status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); - status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); - status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); - status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); - status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); - status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); - status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); - status |= 
test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); - status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); - status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); - status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); - status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); - status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); - status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); - status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); - status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); - status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000); - status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4); - status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae); - status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000); - status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057); - status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47); - status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000); - status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23); - status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6); - status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000); - status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b); - status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95); - status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000); - status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb); - status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000); - status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007); - status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007); - status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007); - status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007); - status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007); - status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007); - status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007); - status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007); - status |= 
test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f); - status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00); - status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21); - status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001); - status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); - status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); - - // Test that the result of an operation is a NaN at all when it should be. - // - // In most configurations these tests' results are checked compared using - // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, - // which causes compareResultF to accept any NaN encoding. We also use the - // same value as the input NaN in tests that have one, so that even in - // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is - // still the exact expected NaN. - status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); - status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); - status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); - status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); - status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000); - status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000); - status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); - -#ifdef ARM_NAN_HANDLING - // Tests specific to the NaN handling of Arm hardware, mimicked by - // arm/mulsf3.S: - // - // - a quiet NaN is distinguished by the top mantissa bit being 1 - // - // - if a signalling NaN appears in the input, the output quiet NaN is - // obtained by setting its top mantissa bit and leaving everything else - // unchanged - // - // - if both operands are signalling NaNs then the output NaN is derived - // from the first operand - // - // - if both operands are quiet NaNs then the output NaN is the first - // operand - // - // - invalid operations not involving an input NaN return the quiet - // NaN with fewest bits set, 0x7fc00000. 
- - status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); - status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); - status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); - status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); - status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); - status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); - status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); - status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); - status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); - status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); - status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); - status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); - status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); - status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); - status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); - status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); - status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); - status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); - status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); - status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); - status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); - status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); - status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); - status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); - status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); - status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); - status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); - status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); - status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); - status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); - status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); - status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); - status |= 
test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); - status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); - status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); - status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); - status |= test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); - status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); - status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); - status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); - status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); - status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); - status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); - status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); - status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); - status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); - status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); - status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); - status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); - status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); - status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); - status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); - status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); - status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); - status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); - status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); -#endif // ARM_NAN_HANDLING - - return status; -} From de3d74aa5de51bd2ed0c461d98634723592af700 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 13 Nov 2025 18:01:35 +0100 Subject: [PATCH 18/25] [CIR] Implement support for GNUNullExpr (#167715) Implement support for GNUNullExpr --- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 4 ++++ clang/test/CIR/CodeGen/gnu-null.cpp | 28 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 clang/test/CIR/CodeGen/gnu-null.cpp diff --git 
a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 1c4f51c11dc5e..6e87fd2c0d04f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -199,6 +199,10 @@ class ScalarExprEmitter : public StmtVisitor { return emitNullValue(e->getType(), cgf.getLoc(e->getSourceRange())); } + mlir::Value VisitGNUNullExpr(const GNUNullExpr *e) { + return emitNullValue(e->getType(), cgf.getLoc(e->getSourceRange())); + } + mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { if (e->isGLValue()) return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), diff --git a/clang/test/CIR/CodeGen/gnu-null.cpp b/clang/test/CIR/CodeGen/gnu-null.cpp new file mode 100644 index 0000000000000..d1d15f2007621 --- /dev/null +++ b/clang/test/CIR/CodeGen/gnu-null.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void gnu_null_expr() { + long a = __null; + int *b = __null; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s64i, !cir.ptr, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s64i +// CIR: cir.store {{.*}} %[[CONST_0]], %[[A_ADDR]] : !s64i, !cir.ptr +// CIR: %[[CONST_NULL:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR: cir.store {{.*}} %[[CONST_NULL]], %[[B_ADDR]] : !cir.ptr, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca i64, i64 1, align 8 +// LLVM: %[[B_ADDR:.*]] = alloca ptr, i64 1, align 8 +// LLVM: store i64 0, ptr %[[A_ADDR]], 
align 8 +// LLVM: store ptr null, ptr %[[B_ADDR]], align 8 + +// OGCG: %[[A_ADDR:.*]] = alloca i64, align 8 +// OGCG: %[[B_ADDR:.*]] = alloca ptr, align 8 +// OGCG: store i64 0, ptr %[[A_ADDR]], align 8 +// OGCG: store ptr null, ptr %[[B_ADDR]], align 8 From 6b49e6a14fa2e00de1fc0bbe60bd304299be516d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 13 Nov 2025 11:01:38 -0600 Subject: [PATCH 19/25] [libc][NFC] Fix warnings in RPC server code --- libc/shared/rpc_opcodes.h | 6 +++--- libc/src/__support/RPC/rpc_server.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/shared/rpc_opcodes.h b/libc/shared/rpc_opcodes.h index 583d622e1fa0d..a9c4f5521021e 100644 --- a/libc/shared/rpc_opcodes.h +++ b/libc/shared/rpc_opcodes.h @@ -47,9 +47,9 @@ typedef enum { LIBC_SYSTEM = LLVM_LIBC_OPCODE(29), // Internal opcodes for testing. - LIBC_TEST_INCREMENT = LLVM_LIBC_OPCODE(1 << 15), - LIBC_TEST_INTERFACE = LLVM_LIBC_OPCODE((1 << 15) + 1), - LIBC_TEST_STREAM = LLVM_LIBC_OPCODE((1 << 15) + 2), + LIBC_TEST_INCREMENT = LLVM_LIBC_OPCODE((1 << 15)), + LIBC_TEST_INTERFACE = LLVM_LIBC_OPCODE(((1 << 15) + 1)), + LIBC_TEST_STREAM = LLVM_LIBC_OPCODE(((1 << 15) + 2)), LIBC_LAST = 0xFFFFFFFF, } rpc_opcode_t; diff --git a/libc/src/__support/RPC/rpc_server.h b/libc/src/__support/RPC/rpc_server.h index 4c8242acafd28..abd604ae48146 100644 --- a/libc/src/__support/RPC/rpc_server.h +++ b/libc/src/__support/RPC/rpc_server.h @@ -298,7 +298,7 @@ LIBC_INLINE static void handle_printf(rpc::Server::Port &port, results[lane] = static_cast( fwrite(buffer, 1, writer.get_chars_written(), files[lane])); - if (results[lane] != writer.get_chars_written() || ret == -1) + if (size_t(results[lane]) != writer.get_chars_written() || ret == -1) results[lane] = -1; } From 09122fecc957408ff0a1bbb4acff319159f04e71 Mon Sep 17 00:00:00 2001 From: Doug Wyatt Date: Thu, 13 Nov 2025 09:04:41 -0800 Subject: [PATCH 20/25] Clang: Remove `-Wperf-constraint-implies-noexcept` from `-Wall`. 
(#167540) In adopting `[[clang::nonblocking]]` there's been some user confusion. Changes to address `-Wfunction-effects` warnings are often pure annotation, with no runtime effect. Changes to avoid `-Wperf-constraint-implies-noexcept` warnings are risky: adding `noexcept` creates a new potential for the program to crash. In retrospect, `-Wperf-constraint-implies-noexcept` shouldn't have been made part of `-Wall`. --------- Co-authored-by: Doug Wyatt --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Basic/DiagnosticGroups.td | 2 +- clang/test/Misc/warning-wall.c | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b3273e39a6279..09eed889b01a8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -359,6 +359,8 @@ Improvements to Clang's diagnostics Moved the warning for a missing (though implied) attribute on a redeclaration into this group. Added a new warning in this group for the case where the attribute is missing/implicit on an override of a virtual method. +- Remove ``-Wperf-constraint-implies-noexcept`` from ``-Wall``. This warning is somewhat nit-picky and + attempts to resolve it, by adding ``noexcept``, can create new ways for programs to crash. (#GH167540) - Implemented diagnostics when retrieving the tuple size for types where its specialization of `std::tuple_size` produces an invalid size (either negative or greater than the implementation limit). (#GH159563) - Fixed fix-it hint for fold expressions. Clang now correctly places the suggested right diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 1e0321de3f4b6..2fff32bbc4d6c 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1314,7 +1314,7 @@ def Consumed : DiagGroup<"consumed">; // DefaultIgnore in addition to putting it here. 
def All : DiagGroup<"all", [Most, Parentheses, Switch, SwitchBool, MisleadingIndentation, PackedNonPod, - VLACxxExtension, PerfConstraintImpliesNoexcept]>; + VLACxxExtension]>; // Warnings that should be in clang-cl /w4. def : DiagGroup<"CL4", [All, Extra]>; diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c index 689868c62f6a7..83b8d4d1f2c29 100644 --- a/clang/test/Misc/warning-wall.c +++ b/clang/test/Misc/warning-wall.c @@ -109,6 +109,5 @@ CHECK-NEXT: -Wmisleading-indentation CHECK-NEXT: -Wpacked-non-pod CHECK-NEXT: -Wvla-cxx-extension CHECK-NEXT: -Wvla-extension-static-assert -CHECK-NEXT: -Wperf-constraint-implies-noexcept CHECK-NOT:-W From e0aec1f4762ac4e656390e26a42286bdb16d3792 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Thu, 13 Nov 2025 18:11:53 +0100 Subject: [PATCH 21/25] [RISCV] For (2^N +/- 2^M) muls, prefer ADD to SUB (#166757) This changes muls by `3 << C` from `(X << C + 2) - (X << C)` to `(X << C + 1) + (X << C)`. If Zba is available, the output is not affected as we emit `(shl (sh1add X, X), C)` instead. 
There are two advantages: - ADD is more compressible - Often a reduced instruction count, by a heuristic that `(X << C + 1)` is more likely to have another use than `(X << C + 2)` --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 25 +- llvm/test/CodeGen/RISCV/mul.ll | 56 +- llvm/test/CodeGen/RISCV/pr145360.ll | 16 +- llvm/test/CodeGen/RISCV/rv32xtheadba.ll | 16 +- llvm/test/CodeGen/RISCV/rv32zba.ll | 20 +- llvm/test/CodeGen/RISCV/rv64xtheadba.ll | 24 +- llvm/test/CodeGen/RISCV/rv64zba.ll | 44 +- .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll | 23 +- .../CodeGen/RISCV/rvv/vector-interleave.ll | 1240 ++++++++--------- .../RISCV/rvv/vreductions-fp-sdnode.ll | 6 +- .../CodeGen/RISCV/srem-seteq-illegal-types.ll | 40 +- llvm/test/CodeGen/RISCV/xqciac.ll | 4 +- 12 files changed, 742 insertions(+), 772 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6306c6db37083..38cce26e44af4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16791,22 +16791,35 @@ static SDValue expandMulToNAFSequence(SDNode *N, SelectionDAG &DAG, static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, uint64_t MulAmt) { uint64_t MulAmtLowBit = MulAmt & (-MulAmt); + SDValue X = N->getOperand(0); ISD::NodeType Op; uint64_t ShiftAmt1; - if (isPowerOf2_64(MulAmt + MulAmtLowBit)) { - Op = ISD::SUB; - ShiftAmt1 = MulAmt + MulAmtLowBit; - } else if (isPowerOf2_64(MulAmt - MulAmtLowBit)) { + bool CanSub = isPowerOf2_64(MulAmt + MulAmtLowBit); + auto PreferSub = [X, MulAmtLowBit]() { + // For MulAmt == 3 << M both (X << M + 2) - (X << M) + // and (X << M + 1) + (X << M) are valid expansions. + // Prefer SUB if we can get (X << M + 2) for free, + // because X is exact (Y >> M + 2). 
+ uint64_t ShAmt = Log2_64(MulAmtLowBit) + 2; + using namespace SDPatternMatch; + return sd_match(X, m_AnyOf(m_Sra(m_Value(), m_SpecificInt(ShAmt)), + m_Srl(m_Value(), m_SpecificInt(ShAmt)))) && + X->getFlags().hasExact(); + }; + if (isPowerOf2_64(MulAmt - MulAmtLowBit) && !(CanSub && PreferSub())) { Op = ISD::ADD; ShiftAmt1 = MulAmt - MulAmtLowBit; + } else if (CanSub) { + Op = ISD::SUB; + ShiftAmt1 = MulAmt + MulAmtLowBit; } else { return SDValue(); } EVT VT = N->getValueType(0); SDLoc DL(N); - SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Log2_64(ShiftAmt1), DL, VT)); - SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT)); return DAG.getNode(Op, DL, VT, Shift1, Shift2); } diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index 4c9a98cabb15f..4533e14c672e7 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1185,29 +1185,29 @@ define i32 @muli32_p384(i32 %a) nounwind { ; RV32I-LABEL: muli32_p384: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 7 -; RV32I-NEXT: slli a0, a0, 9 -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p384: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 7 -; RV32IM-NEXT: slli a0, a0, 9 -; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 8 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: muli32_p384: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 7 -; RV64I-NEXT: slli a0, a0, 9 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli32_p384: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 7 -; RV64IM-NEXT: slli a0, a0, 9 -; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 8 +; RV64IM-NEXT: addw a0, a0, a1 
; RV64IM-NEXT: ret %1 = mul i32 %a, 384 ret i32 %1 @@ -1217,29 +1217,29 @@ define i32 @muli32_p12288(i32 %a) nounwind { ; RV32I-LABEL: muli32_p12288: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 12 -; RV32I-NEXT: slli a0, a0, 14 -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 13 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli32_p12288: ; RV32IM: # %bb.0: ; RV32IM-NEXT: slli a1, a0, 12 -; RV32IM-NEXT: slli a0, a0, 14 -; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: slli a0, a0, 13 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: muli32_p12288: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 12 -; RV64I-NEXT: slli a0, a0, 14 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 13 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli32_p12288: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 12 -; RV64IM-NEXT: slli a0, a0, 14 -; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 13 +; RV64IM-NEXT: addw a0, a0, a1 ; RV64IM-NEXT: ret %1 = mul i32 %a, 12288 ret i32 %1 @@ -2117,14 +2117,14 @@ define i64 @muland_demand(i64 %x) nounwind { ; RV32IM: # %bb.0: ; RV32IM-NEXT: andi a0, a0, -8 ; RV32IM-NEXT: slli a2, a1, 2 -; RV32IM-NEXT: slli a1, a1, 4 -; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: slli a1, a1, 3 +; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: li a2, 12 ; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: add a1, a2, a1 ; RV32IM-NEXT: slli a2, a0, 2 -; RV32IM-NEXT: slli a0, a0, 4 -; RV32IM-NEXT: sub a0, a0, a2 +; RV32IM-NEXT: slli a0, a0, 3 +; RV32IM-NEXT: add a0, a0, a2 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: muland_demand: @@ -2133,16 +2133,16 @@ define i64 @muland_demand(i64 %x) nounwind { ; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: slli a1, a0, 2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muland_demand: ; RV64IM: # %bb.0: ; RV64IM-NEXT: andi a0, a0, -8 ; 
RV64IM-NEXT: slli a1, a0, 2 -; RV64IM-NEXT: slli a0, a0, 4 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 3 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret %and = and i64 %x, 4611686018427387896 %mul = mul i64 %and, 12 @@ -2171,15 +2171,15 @@ define i64 @mulzext_demand(i32 signext %x) nounwind { ; RV64I-LABEL: mulzext_demand: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: slli a0, a0, 34 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: mulzext_demand: ; RV64IM: # %bb.0: ; RV64IM-NEXT: slli a1, a0, 32 -; RV64IM-NEXT: slli a0, a0, 34 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: slli a0, a0, 33 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret %ext = zext i32 %x to i64 %mul = mul i64 %ext, 12884901888 diff --git a/llvm/test/CodeGen/RISCV/pr145360.ll b/llvm/test/CodeGen/RISCV/pr145360.ll index 1c77fadbd4b7d..013bab4ce6292 100644 --- a/llvm/test/CodeGen/RISCV/pr145360.ll +++ b/llvm/test/CodeGen/RISCV/pr145360.ll @@ -27,11 +27,11 @@ define i32 @unsigned(i32 %0, ptr %1) { ; CHECK-NEXT: slli a4, a3, 32 ; CHECK-NEXT: mulhu a2, a2, a4 ; CHECK-NEXT: srli a2, a2, 36 -; CHECK-NEXT: slli a4, a2, 5 -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub a2, a2, a4 +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: srliw a4, a0, 3 -; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: sub a2, a0, a2 ; CHECK-NEXT: mulw a0, a4, a3 ; CHECK-NEXT: sw a2, 0(a1) ; CHECK-NEXT: ret @@ -68,10 +68,10 @@ define i32 @unsigned_div_first(i32 %0, ptr %1) { ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: mulhu a2, a2, a3 ; CHECK-NEXT: srli a2, a2, 36 -; CHECK-NEXT: slli a3, a2, 5 -; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: sub a4, a4, a3 -; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: slli a3, a2, 3 +; CHECK-NEXT: slli a4, a2, 4 +; CHECK-NEXT: add a3, a4, a3 +; CHECK-NEXT: sub a0, a0, a3 ; CHECK-NEXT: sw a0, 0(a1) ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: ret 
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll index 0e4a5c07020ee..fd341da86599f 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll @@ -98,8 +98,8 @@ define i32 @addmul6(i32 %a, i32 %b) { ; RV32I-LABEL: addmul6: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 1 -; RV32I-NEXT: slli a0, a0, 3 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -136,8 +136,8 @@ define i32 @addmul12(i32 %a, i32 %b) { ; RV32I-LABEL: addmul12: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 2 -; RV32I-NEXT: slli a0, a0, 4 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -193,8 +193,8 @@ define i32 @addmul24(i32 %a, i32 %b) { ; RV32I-LABEL: addmul24: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 3 -; RV32I-NEXT: slli a0, a0, 5 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -269,8 +269,8 @@ define i32 @mul96(i32 %a) { ; RV32I-LABEL: mul96: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 5 -; RV32I-NEXT: slli a0, a0, 7 -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32XTHEADBA-LABEL: mul96: diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll index a6dbd94caad4f..ea9d117f2e2e3 100644 --- a/llvm/test/CodeGen/RISCV/rv32zba.ll +++ b/llvm/test/CodeGen/RISCV/rv32zba.ll @@ -85,8 +85,8 @@ define i32 @addmul6(i32 %a, i32 %b) { ; RV32I-LABEL: addmul6: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 1 -; RV32I-NEXT: slli a0, a0, 3 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 2 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -135,8 +135,8 @@ define i32 @addmul12(i32 %a, i32 %b) { ; RV32I-LABEL: addmul12: 
; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 2 -; RV32I-NEXT: slli a0, a0, 4 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 3 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -210,8 +210,8 @@ define i32 @addmul24(i32 %a, i32 %b) { ; RV32I-LABEL: addmul24: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a2, a0, 3 -; RV32I-NEXT: slli a0, a0, 5 -; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: slli a0, a0, 4 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; @@ -310,8 +310,8 @@ define i32 @mul96(i32 %a) { ; RV32I-LABEL: mul96: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 5 -; RV32I-NEXT: slli a0, a0, 7 -; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: slli a0, a0, 6 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32ZBA-LABEL: mul96: @@ -1272,8 +1272,8 @@ define ptr @shl_add_knownbits(ptr %p, i32 %i) { ; RV32I-NEXT: slli a1, a1, 18 ; RV32I-NEXT: srli a1, a1, 18 ; RV32I-NEXT: slli a2, a1, 1 -; RV32I-NEXT: slli a1, a1, 3 -; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: srli a1, a1, 3 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index f4964288e3541..c57dfca1389b6 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -94,8 +94,8 @@ define i64 @addmul6(i64 %a, i64 %b) { ; RV64I-LABEL: addmul6: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 1 -; RV64I-NEXT: slli a0, a0, 3 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -113,8 +113,8 @@ define i64 @disjointormul6(i64 %a, i64 %b) { ; RV64I-LABEL: disjointormul6: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 1 -; RV64I-NEXT: slli a0, a0, 3 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -151,8 
+151,8 @@ define i64 @addmul12(i64 %a, i64 %b) { ; RV64I-LABEL: addmul12: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -227,8 +227,8 @@ define i64 @addmul24(i64 %a, i64 %b) { ; RV64I-LABEL: addmul24: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 3 -; RV64I-NEXT: slli a0, a0, 5 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -527,8 +527,8 @@ define i64 @mul96(i64 %a) { ; RV64I-LABEL: mul96: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 5 -; RV64I-NEXT: slli a0, a0, 7 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64XTHEADBA-LABEL: mul96: @@ -990,8 +990,8 @@ define signext i32 @mulw192(i32 signext %a) { ; RV64I-LABEL: mulw192: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 6 -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 7 +; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64XTHEADBA-LABEL: mulw192: diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 156599fb72877..4ab4ff84dac57 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -489,8 +489,8 @@ define i64 @addmul6(i64 %a, i64 %b) { ; RV64I-LABEL: addmul6: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 1 -; RV64I-NEXT: slli a0, a0, 3 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -514,8 +514,8 @@ define i64 @disjointormul6(i64 %a, i64 %b) { ; RV64I-LABEL: disjointormul6: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 1 -; RV64I-NEXT: slli a0, a0, 3 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 2 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: 
ret ; @@ -564,8 +564,8 @@ define i64 @addmul12(i64 %a, i64 %b) { ; RV64I-LABEL: addmul12: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -692,8 +692,8 @@ define i64 @addmul24(i64 %a, i64 %b) { ; RV64I-LABEL: addmul24: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a2, a0, 3 -; RV64I-NEXT: slli a0, a0, 5 -; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add a0, a0, a2 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; @@ -1350,8 +1350,8 @@ define i64 @mul96(i64 %a) { ; RV64I-LABEL: mul96: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 5 -; RV64I-NEXT: slli a0, a0, 7 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 6 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: mul96: @@ -1618,8 +1618,8 @@ define i64 @zext_mul96(i32 signext %a) { ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: srli a1, a0, 27 -; RV64I-NEXT: srli a0, a0, 25 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 26 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_mul96: @@ -1724,8 +1724,8 @@ define i64 @zext_mul12884901888(i32 signext %a) { ; RV64I-LABEL: zext_mul12884901888: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: slli a0, a0, 34 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 33 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: zext_mul12884901888: @@ -2336,8 +2336,8 @@ define signext i32 @mulw192(i32 signext %a) { ; RV64I-LABEL: mulw192: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 6 -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 7 +; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: mulw192: @@ -4055,8 +4055,8 @@ define i64 @regression(i32 signext %x, i32 signext %y) { ; RV64I-NEXT: sub a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: 
srli a1, a0, 29 -; RV64I-NEXT: srli a0, a0, 27 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: srli a0, a0, 28 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBA-LABEL: regression: @@ -4190,8 +4190,8 @@ define i64 @bext_mul12(i32 %1, i32 %2) { ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: andi a0, a0, 1 ; RV64I-NEXT: slli a1, a0, 2 -; RV64I-NEXT: slli a0, a0, 4 -; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 3 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBANOZBB-LABEL: bext_mul12: @@ -4988,8 +4988,8 @@ define ptr @shl_add_knownbits(ptr %p, i64 %i) { ; RV64I-NEXT: slli a1, a1, 50 ; RV64I-NEXT: srli a1, a1, 50 ; RV64I-NEXT: slli a2, a1, 1 -; RV64I-NEXT: slli a1, a1, 3 -; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: srli a1, a1, 3 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index bd912193c4fed..39732602cc85e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -72,9 +72,8 @@ define fastcc @ret_split_nxv64i32(ptr %x) { ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: vl8re32.v v8, (a1) ; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: slli a4, a2, 5 ; CHECK-NEXT: slli a2, a2, 4 -; CHECK-NEXT: sub a4, a4, a3 +; CHECK-NEXT: add a4, a2, a3 ; CHECK-NEXT: add a5, a1, a2 ; CHECK-NEXT: vl8re32.v v16, (a5) ; CHECK-NEXT: add a5, a1, a3 @@ -112,16 +111,16 @@ define fastcc @ret_split_nxv128i32(ptr %x) { ; CHECK-NEXT: addi a3, a3, 16 ; CHECK-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill ; CHECK-NEXT: slli a3, a2, 3 -; CHECK-NEXT: slli a4, a2, 5 -; CHECK-NEXT: slli a5, a2, 4 +; CHECK-NEXT: slli a4, a2, 4 +; CHECK-NEXT: slli a5, a2, 5 ; CHECK-NEXT: slli a2, a2, 6 -; CHECK-NEXT: sub a6, a4, a3 -; CHECK-NEXT: add a7, a4, a3 -; CHECK-NEXT: sub t0, a2, a5 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: add a7, a5, a3 +; 
CHECK-NEXT: add t0, a5, a4 ; CHECK-NEXT: sub a2, a2, a3 ; CHECK-NEXT: add t1, a1, a3 -; CHECK-NEXT: add t2, a1, a5 -; CHECK-NEXT: add t3, a1, a4 +; CHECK-NEXT: add t2, a1, a4 +; CHECK-NEXT: add t3, a1, a5 ; CHECK-NEXT: vl8re32.v v8, (t1) ; CHECK-NEXT: csrr t1, vlenb ; CHECK-NEXT: slli t1, t1, 4 @@ -157,12 +156,12 @@ define fastcc @ret_split_nxv128i32(ptr %x) { ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vs8r.v v0, (a0) -; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: vs8r.v v16, (a4) ; CHECK-NEXT: add a5, a0, a5 +; CHECK-NEXT: vs8r.v v16, (a5) +; CHECK-NEXT: add a4, a0, a4 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vs8r.v v16, (a5) +; CHECK-NEXT: vs8r.v v16, (a4) ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index ee38257f09cd5..0577fb1ff67bb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -3852,29 +3852,26 @@ define @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv112i8_nxv16i8( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv56i16_nxv8i16( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( 
@vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv28i32_nxv4i32( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( 
@vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( @vector_interleave_nxv14f64_nxv2f64( %v, half %s) { ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma @@ -1020,8 +1019,7 @@ define half @vreduce_fmax_nxv12f16( %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a1, a0, 1 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: li a1, -512 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.s.x v12, a1 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll index bc23388315de7..06bbe5209df35 100644 --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -169,10 +169,10 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV32M-NEXT: srli a2, a1, 31 ; RV32M-NEXT: srli a1, a1, 4 ; RV32M-NEXT: add a1, a1, a2 -; RV32M-NEXT: slli a2, a1, 3 -; RV32M-NEXT: slli a1, a1, 1 -; RV32M-NEXT: sub a1, a1, a2 -; RV32M-NEXT: add a0, a0, a1 +; RV32M-NEXT: slli a2, a1, 1 +; RV32M-NEXT: slli a1, a1, 2 +; RV32M-NEXT: add a1, a1, a2 +; RV32M-NEXT: sub a0, a0, a1 ; RV32M-NEXT: andi a0, a0, 15 ; RV32M-NEXT: addi a0, a0, -1 ; RV32M-NEXT: seqz a0, a0 @@ -187,10 +187,10 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64M-NEXT: srli a2, a1, 63 ; RV64M-NEXT: srli a1, a1, 4 ; RV64M-NEXT: add a1, a1, a2 -; RV64M-NEXT: slli a2, a1, 3 -; RV64M-NEXT: slli a1, a1, 1 -; RV64M-NEXT: sub a1, a1, a2 -; RV64M-NEXT: add a0, a0, a1 +; RV64M-NEXT: slli a2, a1, 1 +; RV64M-NEXT: slli a1, a1, 2 +; RV64M-NEXT: add a1, a1, a2 +; RV64M-NEXT: sub a0, a0, a1 ; RV64M-NEXT: andi 
a0, a0, 15 ; RV64M-NEXT: addi a0, a0, -1 ; RV64M-NEXT: seqz a0, a0 @@ -205,10 +205,10 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV32MV-NEXT: srli a2, a1, 31 ; RV32MV-NEXT: srli a1, a1, 4 ; RV32MV-NEXT: add a1, a1, a2 -; RV32MV-NEXT: slli a2, a1, 3 -; RV32MV-NEXT: slli a1, a1, 1 -; RV32MV-NEXT: sub a1, a1, a2 -; RV32MV-NEXT: add a0, a0, a1 +; RV32MV-NEXT: slli a2, a1, 1 +; RV32MV-NEXT: slli a1, a1, 2 +; RV32MV-NEXT: add a1, a1, a2 +; RV32MV-NEXT: sub a0, a0, a1 ; RV32MV-NEXT: andi a0, a0, 15 ; RV32MV-NEXT: addi a0, a0, -1 ; RV32MV-NEXT: seqz a0, a0 @@ -223,10 +223,10 @@ define i1 @test_srem_even(i4 %X) nounwind { ; RV64MV-NEXT: srli a2, a1, 63 ; RV64MV-NEXT: srli a1, a1, 4 ; RV64MV-NEXT: add a1, a1, a2 -; RV64MV-NEXT: slli a2, a1, 3 -; RV64MV-NEXT: slli a1, a1, 1 -; RV64MV-NEXT: sub a1, a1, a2 -; RV64MV-NEXT: add a0, a0, a1 +; RV64MV-NEXT: slli a2, a1, 1 +; RV64MV-NEXT: slli a1, a1, 2 +; RV64MV-NEXT: add a1, a1, a2 +; RV64MV-NEXT: sub a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 ; RV64MV-NEXT: addi a0, a0, -1 ; RV64MV-NEXT: seqz a0, a0 @@ -823,16 +823,16 @@ define void @test_srem_vec(ptr %X) nounwind { ; RV64MV-NEXT: srai a4, a4, 1 ; RV64MV-NEXT: mulh a6, a3, a6 ; RV64MV-NEXT: add a4, a4, a7 -; RV64MV-NEXT: slli a7, a5, 3 -; RV64MV-NEXT: slli a5, a5, 1 -; RV64MV-NEXT: sub a5, a5, a7 +; RV64MV-NEXT: slli a7, a5, 1 +; RV64MV-NEXT: slli a5, a5, 2 +; RV64MV-NEXT: add a5, a5, a7 ; RV64MV-NEXT: srli a7, a6, 63 ; RV64MV-NEXT: srai a6, a6, 1 ; RV64MV-NEXT: add a6, a6, a7 ; RV64MV-NEXT: add a2, a2, a4 ; RV64MV-NEXT: slli a4, a4, 3 ; RV64MV-NEXT: sub a2, a2, a4 -; RV64MV-NEXT: add a1, a1, a5 +; RV64MV-NEXT: sub a1, a1, a5 ; RV64MV-NEXT: li a4, -1 ; RV64MV-NEXT: srli a4, a4, 31 ; RV64MV-NEXT: vsext.vf8 v8, v10 diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll index 918468bdf03d3..92be4c977dd82 100644 --- a/llvm/test/CodeGen/RISCV/xqciac.ll +++ b/llvm/test/CodeGen/RISCV/xqciac.ll @@ -172,8 +172,8 @@ define dso_local i32 @pow2minuspow2(i32 
%a, i32 %b) local_unnamed_addr #0 { ; RV32IM-LABEL: pow2minuspow2: ; RV32IM: # %bb.0: # %entry ; RV32IM-NEXT: slli a2, a1, 7 -; RV32IM-NEXT: slli a1, a1, 9 -; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: slli a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: ret ; From 98f9b54376247d769eb037dc1e12f82243d87cbe Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 13 Nov 2025 20:14:23 +0300 Subject: [PATCH 22/25] [CodeGen] Hide SparseSet behind a typedef (NFC) (#167898) So that changing the type of the container (planned in a future patch) is less intrusive. --- .../llvm/CodeGen/MachineTraceMetrics.h | 13 +++---- llvm/lib/CodeGen/MachineCombiner.cpp | 7 ++-- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 35 +++++++++---------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h index d51de24d64e8d..74b051d0cddc6 100644 --- a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h @@ -83,6 +83,8 @@ struct LiveRegUnit { LiveRegUnit(unsigned RU) : RegUnit(RU) {} }; +using LiveRegUnitSet = SparseSet; + /// Strategies for selecting traces. enum class MachineTraceStrategy { /// Select the trace through a block that has the fewest instructions. @@ -380,16 +382,15 @@ class MachineTraceMetrics { Trace getTrace(const MachineBasicBlock *MBB); /// Updates the depth of an machine instruction, given RegUnits. - void updateDepth(TraceBlockInfo &TBI, const MachineInstr&, - SparseSet &RegUnits); - void updateDepth(const MachineBasicBlock *, const MachineInstr&, - SparseSet &RegUnits); + void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, + LiveRegUnitSet &RegUnits); + void updateDepth(const MachineBasicBlock *, const MachineInstr &, + LiveRegUnitSet &RegUnits); /// Updates the depth of the instructions from Start to End. 
void updateDepths(MachineBasicBlock::iterator Start, MachineBasicBlock::iterator End, - SparseSet &RegUnits); - + LiveRegUnitSet &RegUnits); }; /// Get the trace ensemble representing the given trace selection strategy. diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 54e2a009b464d..205c79e71854f 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -482,9 +482,8 @@ insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl &InsInstrs, SmallVectorImpl &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, - SparseSet &RegUnits, - const TargetInstrInfo *TII, unsigned Pattern, - bool IncrementalUpdate) { + LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII, + unsigned Pattern, bool IncrementalUpdate) { // If we want to fix up some placeholder for some target, do it now. // We need this because in genAlternativeCodeSequence, we have not decided the // better pattern InsInstrs or DelInstrs, so we don't want generate some @@ -565,7 +564,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (!TraceEnsemble) TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy()); - SparseSet RegUnits; + LiveRegUnitSet RegUnits; RegUnits.setUniverse(TRI->getNumRegUnits()); bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI); diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index c40bd1c83f34a..0312a8e33d669 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -737,7 +737,7 @@ static void getPHIDeps(const MachineInstr &UseMI, // tracking set when scanning instructions downwards. 
static void updatePhysDepsDownwards(const MachineInstr *UseMI, SmallVectorImpl &Deps, - SparseSet &RegUnits, + LiveRegUnitSet &RegUnits, const TargetRegisterInfo *TRI) { SmallVector Kills; SmallVector LiveDefOps; @@ -758,7 +758,7 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, if (!MO.readsReg()) continue; for (MCRegUnit Unit : TRI->regunits(Reg)) { - SparseSet::iterator I = RegUnits.find(Unit); + LiveRegUnitSet::iterator I = RegUnits.find(Unit); if (I == RegUnits.end()) continue; Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo())); @@ -813,9 +813,9 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { return MaxLen; } -void MachineTraceMetrics::Ensemble:: -updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI, - SparseSet &RegUnits) { +void MachineTraceMetrics::Ensemble::updateDepth(TraceBlockInfo &TBI, + const MachineInstr &UseMI, + LiveRegUnitSet &RegUnits) { SmallVector Deps; // Collect all data dependencies. if (UseMI.isPHI()) @@ -852,18 +852,17 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI, } } -void MachineTraceMetrics::Ensemble:: -updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI, - SparseSet &RegUnits) { +void MachineTraceMetrics::Ensemble::updateDepth(const MachineBasicBlock *MBB, + const MachineInstr &UseMI, + LiveRegUnitSet &RegUnits) { updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits); } -void MachineTraceMetrics::Ensemble:: -updateDepths(MachineBasicBlock::iterator Start, - MachineBasicBlock::iterator End, - SparseSet &RegUnits) { - for (; Start != End; Start++) - updateDepth(Start->getParent(), *Start, RegUnits); +void MachineTraceMetrics::Ensemble::updateDepths( + MachineBasicBlock::iterator Start, MachineBasicBlock::iterator End, + LiveRegUnitSet &RegUnits) { + for (; Start != End; Start++) + updateDepth(Start->getParent(), *Start, RegUnits); } /// Compute instruction depths for all instructions above or in MBB in its @@ -887,7 
+886,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) { // in the trace. We should track any live-out physregs that were defined in // the trace. This is quite rare in SSA form, typically created by CSE // hoisting a compare. - SparseSet RegUnits; + LiveRegUnitSet RegUnits; RegUnits.setUniverse(MTM.TRI->getNumRegUnits()); // Go through trace blocks in top-down order, stopping after the center block. @@ -925,7 +924,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) { // Return the issue height of MI after considering any live regunits. // Height is the issue height computed from virtual register dependencies alone. static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, - SparseSet &RegUnits, + LiveRegUnitSet &RegUnits, const TargetSchedModel &SchedModel, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { @@ -944,7 +943,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, // This is a def of Reg. Remove corresponding entries from RegUnits, and // update MI Height to consider the physreg dependencies. for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { - SparseSet::iterator I = RegUnits.find(Unit); + LiveRegUnitSet::iterator I = RegUnits.find(Unit); if (I == RegUnits.end()) continue; unsigned DepHeight = I->Cycle; @@ -1048,7 +1047,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) { // For physregs, the def isn't known when we see the use. // Instead, keep track of the highest use of each regunit. 
- SparseSet RegUnits; + LiveRegUnitSet RegUnits; RegUnits.setUniverse(MTM.TRI->getNumRegUnits()); // If the bottom of the trace was already precomputed, initialize heights From 9216e17fd2e76c65285c312a65ed503afcd5342c Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 13 Nov 2025 18:23:11 +0100 Subject: [PATCH 23/25] [CIR] Upstream basic support for ExtVector element expr (#167570) Upstream the basic support for the ExtVectorType element expr --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 79 +++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 2 + clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 2 + clang/lib/CIR/CodeGen/CIRGenFunction.h | 6 ++ clang/lib/CIR/CodeGen/CIRGenValue.h | 33 +++++++- clang/test/CIR/CodeGen/vector-ext-element.cpp | 46 +++++++++++ 6 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 clang/test/CIR/CodeGen/vector-ext-element.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index c55fcabef0b3f..c67493a913d58 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -631,10 +631,49 @@ RValue CIRGenFunction::emitLoadOfLValue(LValue lv, SourceLocation loc) { lv.getVectorIdx())); } + if (lv.isExtVectorElt()) + return emitLoadOfExtVectorElementLValue(lv); + cgm.errorNYI(loc, "emitLoadOfLValue"); return RValue::get(nullptr); } +int64_t CIRGenFunction::getAccessedFieldNo(unsigned int idx, + const mlir::ArrayAttr elts) { + auto elt = mlir::cast(elts[idx]); + return elt.getInt(); +} + +// If this is a reference to a subset of the elements of a vector, create an +// appropriate shufflevector. +RValue CIRGenFunction::emitLoadOfExtVectorElementLValue(LValue lv) { + mlir::Location loc = lv.getExtVectorPointer().getLoc(); + mlir::Value vec = builder.createLoad(loc, lv.getExtVectorAddress()); + + // HLSL allows treating scalars as one-element vectors. Converting the scalar + // IR value to a vector here allows the rest of codegen to behave as normal. 
+ if (getLangOpts().HLSL && !mlir::isa(vec.getType())) { + cgm.errorNYI(loc, "emitLoadOfExtVectorElementLValue: HLSL"); + return {}; + } + + const mlir::ArrayAttr elts = lv.getExtVectorElts(); + + // If the result of the expression is a non-vector type, we must be extracting + // a single element. Just codegen as an extractelement. + const auto *exprVecTy = lv.getType()->getAs(); + if (!exprVecTy) { + int64_t indexValue = getAccessedFieldNo(0, elts); + cir::ConstantOp index = + builder.getConstInt(loc, builder.getSInt64Ty(), indexValue); + return RValue::get(cir::VecExtractOp::create(builder, loc, vec, index)); + } + + cgm.errorNYI( + loc, "emitLoadOfExtVectorElementLValue: Result of expr is vector type"); + return {}; +} + static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) { assert(!cir::MissingFeatures::weakRefReference()); return cgm.getAddrOfFunction(gd); @@ -1120,6 +1159,46 @@ CIRGenFunction::emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e) { return lv; } +LValue CIRGenFunction::emitExtVectorElementExpr(const ExtVectorElementExpr *e) { + // Emit the base vector as an l-value. + LValue base; + + // ExtVectorElementExpr's base can either be a vector or pointer to vector. + if (e->isArrow()) { + cgm.errorNYI(e->getSourceRange(), + "emitExtVectorElementExpr: pointer to vector"); + return {}; + } else if (e->getBase()->isGLValue()) { + // Otherwise, if the base is an lvalue ( as in the case of foo.x.x), + // emit the base as an lvalue. + assert(e->getBase()->getType()->isVectorType()); + base = emitLValue(e->getBase()); + } else { + // Otherwise, the base is a normal rvalue (as in (V+V).x), emit it as such. + cgm.errorNYI(e->getSourceRange(), + "emitExtVectorElementExpr: base is a normal rvalue"); + return {}; + } + + QualType type = + e->getType().withCVRQualifiers(base.getQuals().getCVRQualifiers()); + + // Encode the element access list into a vector of unsigned indices. 
+ SmallVector indices; + e->getEncodedElementAccess(indices); + + if (base.isSimple()) { + SmallVector attrElts(indices.begin(), indices.end()); + mlir::ArrayAttr elts = builder.getI64ArrayAttr(attrElts); + return LValue::makeExtVectorElt(base.getAddress(), elts, type, + base.getBaseInfo()); + } + + cgm.errorNYI(e->getSourceRange(), + "emitExtVectorElementExpr: isSimple is false"); + return {}; +} + LValue CIRGenFunction::emitStringLiteralLValue(const StringLiteral *e, llvm::StringRef name) { cir::GlobalOp globalOp = cgm.getGlobalForStringLiteral(e, name); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 6e87fd2c0d04f..5d9188777741d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -283,6 +283,8 @@ class ScalarExprEmitter : public StmtVisitor { e->getSourceRange().getBegin()); } + mlir::Value VisitExtVectorElementExpr(Expr *e) { return emitLoadOfLValue(e); } + mlir::Value VisitMemberExpr(MemberExpr *e); mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index cc75acc18c211..b73071af2a5d4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -887,6 +887,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitConditionalOperatorLValue(cast(e)); case Expr::ArraySubscriptExprClass: return emitArraySubscriptExpr(cast(e)); + case Expr::ExtVectorElementExprClass: + return emitExtVectorElementExpr(cast(e)); case Expr::UnaryOperatorClass: return emitUnaryOpLValue(cast(e)); case Expr::StringLiteralClass: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 4f5948b6e4467..3984f288b9bb0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1277,6 +1277,8 @@ class CIRGenFunction : public CIRGenTypeCache { 
QualType &baseType, Address &addr); LValue emitArraySubscriptExpr(const clang::ArraySubscriptExpr *e); + LValue emitExtVectorElementExpr(const ExtVectorElementExpr *e); + Address emitArrayToPointerDecay(const Expr *e, LValueBaseInfo *baseInfo = nullptr); @@ -1342,6 +1344,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emittedE, bool isDynamic); + int64_t getAccessedFieldNo(unsigned idx, mlir::ArrayAttr elts); + RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, @@ -1637,6 +1641,8 @@ class CIRGenFunction : public CIRGenTypeCache { /// Load a complex number from the specified l-value. mlir::Value emitLoadOfComplex(LValue src, SourceLocation loc); + RValue emitLoadOfExtVectorElementLValue(LValue lv); + /// Given an expression that represents a value lvalue, this method emits /// the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index ab245a771d72c..20a3d0ef61341 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -166,7 +166,8 @@ class LValue { // this is the alignment of the whole vector) unsigned alignment; mlir::Value v; - mlir::Value vectorIdx; // Index for vector subscript + mlir::Value vectorIdx; // Index for vector subscript + mlir::Attribute vectorElts; // ExtVector element subset: V.xyx mlir::Type elementType; LValueBaseInfo baseInfo; const CIRGenBitFieldInfo *bitFieldInfo{nullptr}; @@ -190,6 +191,7 @@ class LValue { bool isSimple() const { return lvType == Simple; } bool isVectorElt() const { return lvType == VectorElt; } bool isBitField() const { return lvType == BitField; } + bool isExtVectorElt() const { return lvType == ExtVectorElt; } bool isGlobalReg() const { return lvType == GlobalReg; } bool isVolatile() const { return quals.hasVolatile(); } @@ -254,6 +256,22 @@ class LValue { return vectorIdx; } + // extended vector elements. 
+ Address getExtVectorAddress() const { + assert(isExtVectorElt()); + return Address(getExtVectorPointer(), elementType, getAlignment()); + } + + mlir::Value getExtVectorPointer() const { + assert(isExtVectorElt()); + return v; + } + + mlir::ArrayAttr getExtVectorElts() const { + assert(isExtVectorElt()); + return mlir::cast(vectorElts); + } + static LValue makeVectorElt(Address vecAddress, mlir::Value index, clang::QualType t, LValueBaseInfo baseInfo) { LValue r; @@ -265,6 +283,19 @@ class LValue { return r; } + static LValue makeExtVectorElt(Address vecAddress, mlir::ArrayAttr elts, + clang::QualType type, + LValueBaseInfo baseInfo) { + LValue r; + r.lvType = ExtVectorElt; + r.v = vecAddress.getPointer(); + r.elementType = vecAddress.getElementType(); + r.vectorElts = elts; + r.initialize(type, type.getQualifiers(), vecAddress.getAlignment(), + baseInfo); + return r; + } + // bitfield lvalue Address getBitFieldAddress() const { return Address(getBitFieldPointer(), elementType, getAlignment()); diff --git a/clang/test/CIR/CodeGen/vector-ext-element.cpp b/clang/test/CIR/CodeGen/vector-ext-element.cpp new file mode 100644 index 0000000000000..de9d53936d2eb --- /dev/null +++ b/clang/test/CIR/CodeGen/vector-ext-element.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +typedef int vi4 __attribute__((ext_vector_type(4))); + +void element_expr_from_gl() { + vi4 a; + int x = a.x; + int y = a.y; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] +// CIR: %[[X_ADDR:.*]] = cir.alloca 
!s32i, !cir.ptr, ["x", init] +// CIR: %[[Y_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["y", init] +// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s64i +// CIR: %[[ELEM_0:.*]] = cir.vec.extract %[[TMP_A]][%[[CONST_0]] : !s64i] : !cir.vector<4 x !s32i> +// CIR: cir.store {{.*}} %[[ELEM_0]], %[[X_ADDR]] : !s32i, !cir.ptr +// CIR: %[[TMP_A:.*]] = cir.load {{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s64i +// CIR: %[[ELEM_1:.*]] = cir.vec.extract %[[TMP_A]][%[[CONST_1]] : !s64i] : !cir.vector<4 x !s32i> +// CIR: cir.store {{.*}} %[[ELEM_1]], %[[Y_ADDR]] : !s32i, !cir.ptr + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[X_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[Y_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// LLVM: %[[ELEM_0:.*]] = extractelement <4 x i32> %4, i64 0 +// LLVM: store i32 %[[ELEM_0]], ptr %[[X_ADDR]], align 4 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// LLVM: %[[ELEM_1:.*]] = extractelement <4 x i32> %6, i64 1 +// LLVM: store i32 %[[ELEM_1]], ptr %[[Y_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[X_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[Y_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// OGCG: %[[ELEM_0:.*]] = extractelement <4 x i32> %[[TMP_A]], i64 0 +// OGCG: store i32 %[[ELEM_0]], ptr %[[X_ADDR]], align 4 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// OGCG: %[[ELEM_1:.*]] = extractelement <4 x i32> %[[TMP_A]], i64 1 +// OGCG: store i32 %[[ELEM_1]], ptr %[[Y_ADDR]], align 4 From 8d6a1def4d10fcd2a87a2b7d396764f861ed957b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 13 Nov 2025 09:25:53 -0800 Subject: [PATCH 24/25] [SelectionDAGISel] Don't merge 
input chains if it would put a token factor in the way of a glue. (#167805) In the new test, we're trying to fold a load and a X86ISD::CALL. The call has a CopyToReg glued to it. The load and the call have different input chains so they need to be merged. This results in a TokenFactor that gets put between the CopyToReg and the final CALLm instruction. The DAG scheduler can't handle that. The load here was created by legalization of the extract_element using a stack temporary store and load. A normal IR load would be chained into call sequence by SelectionDAGBuilder. This would usually have the load chained in before the CopyToReg. The store/load created by legalization don't get chained into the rest of the DAG. Fixes #63790 --- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 16 +++++++++--- llvm/test/CodeGen/X86/pr63790.ll | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr63790.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index e7d4c4b88191b..5bed32db528d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2783,7 +2783,7 @@ void SelectionDAGISel::UpdateChains( /// be used as the input node chain for the generated nodes. static SDValue HandleMergeInputChains(const SmallVectorImpl &ChainNodesMatched, - SelectionDAG *CurDAG) { + SDValue InputGlue, SelectionDAG *CurDAG) { SmallPtrSet Visited; SmallVector Worklist; @@ -2826,8 +2826,16 @@ HandleMergeInputChains(const SmallVectorImpl &ChainNodesMatched, // node that is both the predecessor and successor of the // to-be-merged nodes. Fail. Visited.clear(); - for (SDValue V : InputChains) + for (SDValue V : InputChains) { + // If we need to create a TokenFactor, and any of the input chain nodes will + // also be glued to the output, we cannot merge the chains. 
The TokenFactor + // would prevent the glue from being honored. + if (InputChains.size() != 1 && + V->getValueType(V->getNumValues() - 1) == MVT::Glue && + InputGlue.getNode() == V.getNode()) + return SDValue(); Worklist.push_back(V.getNode()); + } for (auto *N : ChainNodesMatched) if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true)) @@ -3989,7 +3997,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, } // Merge the input chains if they are not intra-pattern references. - InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG); + InputChain = HandleMergeInputChains(ChainNodesMatched, InputGlue, CurDAG); if (!InputChain.getNode()) break; // Failed to merge. @@ -4033,7 +4041,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, break; // Merge the input chains if they are not intra-pattern references. - InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG); + InputChain = HandleMergeInputChains(ChainNodesMatched, InputGlue, CurDAG); if (!InputChain.getNode()) break; // Failed to merge. 
diff --git a/llvm/test/CodeGen/X86/pr63790.ll b/llvm/test/CodeGen/X86/pr63790.ll new file mode 100644 index 0000000000000..e4e7a3c536d07 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr63790.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64 | FileCheck %s + +define void @f(ptr %0, i64 %1) { +; CHECK-LABEL: f: +; CHECK: # %bb.0: # %BB +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: andl $1, %esi +; CHECK-NEXT: movaps (%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $42, %edi +; CHECK-NEXT: callq *16(%rsp,%rsi,8) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rax) +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +BB: + %fps = load <2 x ptr>, ptr %0 + %fp = extractelement <2 x ptr> %fps, i64 %1 + %p = call ptr %fp(i32 42) + store <2 x ptr> %fps, ptr %p + ret void +} From d1cc1376a08f23eebc74f564782862e19958d786 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Thu, 13 Nov 2025 20:27:35 +0300 Subject: [PATCH 25/25] [CodeGen] Add TRI::regunits() iterating over all register units (NFC) (#167901) --- llvm/include/llvm/MC/MCRegisterInfo.h | 8 ++++++++ llvm/lib/CodeGen/LiveIntervals.cpp | 4 ++-- llvm/lib/CodeGen/LiveRegUnits.cpp | 4 ++-- llvm/lib/CodeGen/MachineVerifier.cpp | 6 +++--- llvm/lib/CodeGen/RDFRegisters.cpp | 4 ++-- llvm/lib/CodeGen/RegAllocFast.cpp | 3 +-- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index e6dbb38dfee67..f1caa077a6d7b 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -16,6 +16,7 @@ #define LLVM_MC_MCREGISTERINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Sequence.h" #include "llvm/ADT/iterator.h" 
#include "llvm/ADT/iterator_range.h" #include "llvm/MC/LaneBitmask.h" @@ -259,6 +260,9 @@ class LLVM_ABI MCRegisterInfo { iterator_range> sub_and_superregs_inclusive(MCRegister Reg) const; + /// Returns an iterator range over all regunits. + iota_range regunits() const; + /// Returns an iterator range over all regunits for \p Reg. iterator_range regunits(MCRegister Reg) const; @@ -798,6 +802,10 @@ MCRegisterInfo::sub_and_superregs_inclusive(MCRegister Reg) const { return concat(subregs_inclusive(Reg), superregs(Reg)); } +inline iota_range MCRegisterInfo::regunits() const { + return seq(getNumRegUnits()); +} + inline iterator_range MCRegisterInfo::regunits(MCRegister Reg) const { return make_range({Reg, this}, MCRegUnitIterator()); diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 27c5addffa4ab..b600e0411bc48 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -173,8 +173,8 @@ void LiveIntervals::analyze(MachineFunction &fn) { if (EnablePrecomputePhysRegs) { // For stress testing, precompute live ranges of all physical register // units, including reserved registers. 
- for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) - getRegUnit(i); + for (MCRegUnit Unit : TRI->regunits()) + getRegUnit(Unit); } } diff --git a/llvm/lib/CodeGen/LiveRegUnits.cpp b/llvm/lib/CodeGen/LiveRegUnits.cpp index 0d87062169585..3e7052a9b6245 100644 --- a/llvm/lib/CodeGen/LiveRegUnits.cpp +++ b/llvm/lib/CodeGen/LiveRegUnits.cpp @@ -20,7 +20,7 @@ using namespace llvm; void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) { - for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { + for (MCRegUnit U : TRI->regunits()) { for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) { Units.reset(U); @@ -31,7 +31,7 @@ void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) { } void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) { - for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) { + for (MCRegUnit U : TRI->regunits()) { for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) { if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) { Units.set(U); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 013f52938b65c..a2a66d6128348 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -3564,9 +3564,9 @@ void MachineVerifier::verifyLiveIntervals() { } // Verify all the cached regunit intervals. 
- for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) - if (const LiveRange *LR = LiveInts->getCachedRegUnit(i)) - verifyLiveRange(*LR, VirtRegOrUnit(i)); + for (MCRegUnit Unit : TRI->regunits()) + if (const LiveRange *LR = LiveInts->getCachedRegUnit(Unit)) + verifyLiveRange(*LR, VirtRegOrUnit(Unit)); } void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index 1400699a607ff..e4b63a3a40805 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -46,7 +46,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, UnitInfos.resize(TRI.getNumRegUnits()); - for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) { + for (MCRegUnit U : TRI.regunits()) { if (UnitInfos[U].Reg != 0) continue; MCRegUnitRootIterator R(U, &TRI); @@ -88,7 +88,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, } AliasInfos.resize(TRI.getNumRegUnits()); - for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) { + for (MCRegUnit U : TRI.regunits()) { BitVector AS(TRI.getNumRegs()); for (MCRegUnitRootIterator R(U, &TRI); R.isValid(); ++R) for (MCPhysReg S : TRI.superregs_inclusive(*R)) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 9097728c84e7e..62d7d2f9eed9a 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -1291,8 +1291,7 @@ bool RegAllocFastImpl::setPhysReg(MachineInstr &MI, MachineOperand &MO, #ifndef NDEBUG void RegAllocFastImpl::dumpState() const { - for (unsigned Unit = 1, UnitE = TRI->getNumRegUnits(); Unit != UnitE; - ++Unit) { + for (MCRegUnit Unit : TRI->regunits()) { switch (unsigned VirtReg = RegUnitStates[Unit]) { case regFree: break;