From 0bd07652524ebacdee166eb609fef48c50769b09 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Fri, 17 Jan 2025 09:06:04 +0100 Subject: [PATCH 01/45] EmitC: Allow arrays of size zero (#123292) This is allowed as a GCC extension, see https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html. --- mlir/docs/Dialects/emitc.md | 2 ++ mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 4 ++-- mlir/test/Dialect/EmitC/invalid_types.mlir | 8 -------- mlir/test/Dialect/EmitC/types.mlir | 4 +++- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/mlir/docs/Dialects/emitc.md b/mlir/docs/Dialects/emitc.md index 743d70959f3d8..e2288f518dae1 100644 --- a/mlir/docs/Dialects/emitc.md +++ b/mlir/docs/Dialects/emitc.md @@ -16,6 +16,8 @@ The following convention is followed: floating types. * If `__bf16` is used, the code requires a compiler that supports it, such as GCC or Clang. +* If `emitc.array` with a dimension of size zero is used, then the code + requires [a GCC extension](https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html). * Else the generated code is compatible with C99. These restrictions are neither inherent to the EmitC dialect itself nor to the diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index fdc21d6c6e24b..c818dd18a3d24 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -971,8 +971,8 @@ LogicalResult emitc::ArrayType::verify( return emitError() << "shape must not be empty"; for (int64_t dim : shape) { - if (dim <= 0) - return emitError() << "dimensions must have positive size"; + if (dim < 0) + return emitError() << "dimensions must have non-negative size"; } if (!elementType) diff --git a/mlir/test/Dialect/EmitC/invalid_types.mlir b/mlir/test/Dialect/EmitC/invalid_types.mlir index 302a345c7c4f4..c39a881ff26ad 100644 --- a/mlir/test/Dialect/EmitC/invalid_types.mlir +++ b/mlir/test/Dialect/EmitC/invalid_types.mlir @@ -36,14 +36,6 @@ func.func @illegal_array_missing_x( // ----- -func.func @illegal_array_non_positive_dimenson( - // expected-error @+1 {{dimensions must have positive size}} - %arg0: !emitc.array<0xi32> -) { -} - -// ----- - func.func @illegal_array_missing_type( // expected-error @+1 {{expected non-function type}} %arg0: !emitc.array<10x> diff --git a/mlir/test/Dialect/EmitC/types.mlir b/mlir/test/Dialect/EmitC/types.mlir index e3462bffc5b0d..d4dd94457f39b 100644 --- a/mlir/test/Dialect/EmitC/types.mlir +++ b/mlir/test/Dialect/EmitC/types.mlir @@ -17,7 +17,9 @@ func.func @array_types( // CHECK-SAME: !emitc.array<30x!emitc.ssize_t> %arg5: !emitc.array<30x!emitc.ssize_t>, // CHECK-SAME: !emitc.array<30x!emitc.ptrdiff_t> - %arg6: !emitc.array<30x!emitc.ptrdiff_t> + %arg6: !emitc.array<30x!emitc.ptrdiff_t>, + // CHECK-SAME: !emitc.array<0xi64> + %arg7: !emitc.array<0xi64> ) { return } From 1274bca2ad5befe56d82ef76100e2c294ca57ce2 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 17 Jan 2025 16:06:31 +0800 Subject: [PATCH 02/45] [X86][APX] Support APX + MOVRS (#123264) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/784266 --- llvm/lib/Target/X86/X86InstrAVX10.td | 4 +- llvm/lib/Target/X86/X86InstrMisc.td | 21 ++++- llvm/test/CodeGen/X86/movrs-builtins.ll | 21 +++++ llvm/test/MC/Disassembler/X86/movrs.txt | 98 +++++++++++++++++++++++- llvm/test/MC/X86/movrs-att-64.s | 98 +++++++++++++++++++++++- llvm/test/MC/X86/movrs-intel-64.s | 98 +++++++++++++++++++++++- llvm/test/TableGen/x86-instr-mapping.inc | 4 + 7 files changed, 337 insertions(+), 7 deletions(-) diff --git 
a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 127016184bc17..edbcb17297603 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1767,9 +1767,9 @@ multiclass vmovrs_p opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512] in + let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in defm Z : vmovrs_p, EVEX_V512; - let Predicates = [HasMOVRS, HasAVX10_2] in { + let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { defm Z128 : vmovrs_p, EVEX_V128; defm Z256 : vmovrs_p, EVEX_V256; } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 9fabe2acf0019..43c02c4f85844 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1733,7 +1733,7 @@ def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", // let SchedRW = [WriteLoad] in { -let Predicates = [HasMOVRS, NoEGPR] in { +let Predicates = [HasMOVRS, NoEGPR, In64BitMode] in { def MOVRS8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "movrs{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, T8; @@ -1746,8 +1746,25 @@ def MOVRS32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def MOVRS64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movrs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, T8; +} + +let Predicates = [HasMOVRS] in def PREFETCHRST2 : I<0x18, MRM4m, (outs), (ins i8mem:$src), "prefetchrst2\t$src", [(int_x86_prefetchrs addr:$src)]>, TB; + +let Predicates = [HasMOVRS, HasEGPR, In64BitMode] in { +def MOVRS8rm_EVEX : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "movrs{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS16rm_EVEX : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movrs{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (int_x86_movrshi addr:$src))]>, EVEX, NoCD8, PD, T_MAP4; +def MOVRS32rm_EVEX : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movrs{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_movrssi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS64rm_EVEX : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movrs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, EVEX, NoCD8, T_MAP4, REX_W; +} } -} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/movrs-builtins.ll b/llvm/test/CodeGen/X86/movrs-builtins.ll index c1722c831c95d..ccf0833e53990 100644 --- a/llvm/test/CodeGen/X86/movrs-builtins.ll +++ b/llvm/test/CodeGen/X86/movrs-builtins.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs,+egpr | FileCheck %s --check-prefix=EGPR define i8 @test_movrs_si8(ptr %__A) { ; CHECK-LABEL: test_movrs_si8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsb (%rdi), %al # encoding: [0x0f,0x38,0x8a,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si8: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsb (%rdi), %al # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8a,0x07] +; EGPR-NEXT: retq # encoding: 
[0xc3] entry: %0 = call i8 @llvm.x86.movrsqi(ptr %__A) ret i8 %0 @@ -17,6 +23,11 @@ define i16 @test_movrs_si16(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsw (%rdi), %ax # encoding: [0x66,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si16: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsw (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i16 @llvm.x86.movrshi(ptr %__A) ret i16 %0 @@ -28,6 +39,11 @@ define i32 @test_movrs_si32(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsl (%rdi), %eax # encoding: [0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsl (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i32 @llvm.x86.movrssi(ptr %__A) ret i32 %0 @@ -39,6 +55,11 @@ define i64 @test_movrs_si64(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsq (%rdi), %rax # encoding: [0x48,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i64 @llvm.x86.movrsdi(ptr %__A) ret i64 %0 diff --git a/llvm/test/MC/Disassembler/X86/movrs.txt b/llvm/test/MC/Disassembler/X86/movrs.txt index fa91b542d3f73..caac8bc8b7b30 100644 --- a/llvm/test/MC/Disassembler/X86/movrs.txt +++ b/llvm/test/MC/Disassembler/X86/movrs.txt @@ -95,4 +95,100 @@ # ATT: movrsq -128(%rdx), %rbx # INTEL: movrs rbx, qword ptr [rdx - 128] -0x48,0x0f,0x38,0x8b,0x5a,0x80 \ No newline at end of file +0x48,0x0f,0x38,0x8b,0x5a,0x80 + +# ATT: movrsb 268435456(%rbp,%r14,8), %r16b +# INTEL: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsb 291(%r17,%rax,4), %bl +# INTEL: movrs bl, byte ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsb (%rip), %bl +# INTEL: movrs bl, byte ptr [rip] +0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsb -32(,%rbp,2), %r18b +# INTEL: movrs r18b, byte ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsb 127(%r19), %bl +# INTEL: movrs bl, byte ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f + +# ATT: movrsb -128(%r20,%riz), %bl +# INTEL: movrs bl, byte ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80 + +# ATT: movrsw 268435456(%rbp,%r14,8), %r16w +# INTEL: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsw 291(%r17,%rax,4), %bx +# INTEL: movrs bx, word ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsw (%rip), %bx +# INTEL: movrs bx, word ptr [rip] +0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsw -32(,%rbp,2), %r18w +# INTEL: movrs r18w, word ptr [2*rbp - 32] +0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsw 127(%r19), %bx +# INTEL: movrs bx, word ptr [r19 + 127] +0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f + +# ATT: movrsw -128(%r20,%riz), %bx +# INTEL: movrs bx, word ptr [r20 + riz - 128] +0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsl 268435456(%rbp,%r14,8), %r16d +# INTEL: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] 
+0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsl 291(%r17,%rax,4), %ebx +# INTEL: movrs ebx, dword ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsl (%rip), %ebx +# INTEL: movrs ebx, dword ptr [rip] +0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsl -32(,%rbp,2), %r18d +# INTEL: movrs r18d, dword ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsl 127(%r19), %ebx +# INTEL: movrs ebx, dword ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f + +# ATT: movrsl -128(%r20,%riz), %ebx +# INTEL: movrs ebx, dword ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsq 268435456(%rbp,%r14,8), %r16 +# INTEL: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsq 291(%r17,%rax,4), %rbx +# INTEL: movrs rbx, qword ptr [r17 + 4*rax + 291] +0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsq (%rip), %rbx +# INTEL: movrs rbx, qword ptr [rip] +0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsq -32(,%rbp,2), %r18 +# INTEL: movrs r18, qword ptr [2*rbp - 32] +0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsq 127(%r19), %rbx +# INTEL: movrs rbx, qword ptr [r19 + 127] +0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f + +# ATT: movrsq -128(%r20,%riz), %rbx +# INTEL: movrs rbx, qword ptr [r20 + riz - 128] +0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80 diff --git a/llvm/test/MC/X86/movrs-att-64.s b/llvm/test/MC/X86/movrs-att-64.s index 59a2fdb6d10b2..e951b30369d46 100644 --- a/llvm/test/MC/X86/movrs-att-64.s +++ b/llvm/test/MC/X86/movrs-att-64.s @@ -94,4 +94,100 @@ // CHECK: movrsq -128(%rdx), %rbx // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs -128(%rdx), %rbx \ No newline at end of file + movrs -128(%rdx), %rbx + +// CHECK: movrsb 268435456(%rbp,%r14,8), %r16b +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16b + +// CHECK: movrsb 291(%r17,%rax,4), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bl + +// CHECK: {evex} movrsb (%rip), %bl +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bl + +// CHECK: movrsb -32(,%rbp,2), %r18b +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18b + +// CHECK: movrsb 127(%r19), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs 127(%r19), %bl + +// CHECK: movrsb -128(%r20), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs -128(%r20), %bl + +// CHECK: movrsw 268435456(%rbp,%r14,8), %r16w +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16w + +// CHECK: movrsw 291(%r17,%rax,4), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bx + +// CHECK: {evex} movrsw (%rip), %bx +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bx + +// CHECK: movrsw -32(,%rbp,2), %r18w +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18w + +// CHECK: movrsw 127(%r19), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %bx + +// CHECK: movrsw -128(%r20), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs 
-128(%r20), %bx + +// CHECK: movrsl 268435456(%rbp,%r14,8), %r16d +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16d + +// CHECK: movrsl 291(%r17,%rax,4), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %ebx + +// CHECK: {evex} movrsl (%rip), %ebx +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %ebx + +// CHECK: movrsl -32(,%rbp,2), %r18d +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18d + +// CHECK: movrsl 127(%r19), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %ebx + +// CHECK: movrsl -128(%r20), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %ebx + +// CHECK: movrsq 268435456(%rbp,%r14,8), %r16 +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16 + +// CHECK: movrsq 291(%r17,%rax,4), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %rbx + +// CHECK: {evex} movrsq (%rip), %rbx +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %rbx + +// CHECK: movrsq -32(,%rbp,2), %r18 +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18 + +// CHECK: movrsq 127(%r19), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %rbx + +// CHECK: movrsq -128(%r20), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %rbx diff --git a/llvm/test/MC/X86/movrs-intel-64.s b/llvm/test/MC/X86/movrs-intel-64.s index f41075a21b3e8..f698f1c440442 100644 --- a/llvm/test/MC/X86/movrs-intel-64.s +++ b/llvm/test/MC/X86/movrs-intel-64.s @@ -94,4 +94,100 @@ // CHECK: movrs rbx, qword ptr [rdx - 128] // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs rbx, qword ptr [rdx - 128] \ No newline at end of file + movrs rbx, qword ptr [rdx - 128] + +// CHECK: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16b, byte ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bl, byte ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bl, byte ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bl, byte ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bl, byte ptr [rip] + +// CHECK: movrs r18b, byte ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18b, byte ptr [2*rbp - 32] + +// CHECK: movrs bl, byte ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs bl, byte ptr [r19 + 127] + +// CHECK: movrs bl, byte ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs bl, byte ptr [r20 - 128] + +// CHECK: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16w, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bx, word ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bx, word ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bx, word ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + 
{evex} movrs bx, word ptr [rip] + +// CHECK: movrs r18w, word ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18w, word ptr [2*rbp - 32] + +// CHECK: movrs bx, word ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs bx, word ptr [r19 + 127] + +// CHECK: movrs bx, word ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs bx, word ptr [r20 - 128] + +// CHECK: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16d, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs ebx, dword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs ebx, dword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs ebx, dword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs ebx, dword ptr [rip] + +// CHECK: movrs r18d, dword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18d, dword ptr [2*rbp - 32] + +// CHECK: movrs ebx, dword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs ebx, dword ptr [r19 + 127] + +// CHECK: movrs ebx, dword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs ebx, dword ptr [r20 - 128] + +// CHECK: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs rbx, qword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs rbx, qword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs rbx, qword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs rbx, qword ptr [rip] + +// CHECK: movrs r18, qword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18, qword ptr [2*rbp - 32] + +// CHECK: movrs rbx, qword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs rbx, qword ptr [r19 + 127] + +// CHECK: movrs rbx, qword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs rbx, qword ptr [r20 - 128] diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index ed43684db2dfc..55d392f5e271f 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -133,6 +133,10 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::MOVDIR64B64_EVEX, X86::MOVDIR64B64 }, { X86::MOVDIRI32_EVEX, X86::MOVDIRI32 }, { X86::MOVDIRI64_EVEX, X86::MOVDIRI64 }, + { X86::MOVRS16rm_EVEX, X86::MOVRS16rm }, + { X86::MOVRS32rm_EVEX, X86::MOVRS32rm }, + { X86::MOVRS64rm_EVEX, X86::MOVRS64rm }, + { X86::MOVRS8rm_EVEX, X86::MOVRS8rm }, { X86::MULX32rm_EVEX, X86::MULX32rm }, { X86::MULX32rr_EVEX, X86::MULX32rr }, { X86::MULX64rm_EVEX, X86::MULX64rm }, From c3ba6f378ef80d750e2278560c6f95a300114412 Mon Sep 17 00:00:00 2001 From: Viktoriia Bakalova <115406782+VitaNuo@users.noreply.github.com> Date: Fri, 17 Jan 2025 09:10:58 +0100 Subject: [PATCH 03/45] =?UTF-8?q?[Modules]=20Delay=20deserialization=20of?= =?UTF-8?q?=20preferred=5Fname=20attribute=20at=20r=E2=80=A6=20(#122726)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ecord level. 
This fixes the incorrect diagnostic emitted when compiling the following snippet ``` // string_view.h template class basic_string_view; typedef basic_string_view string_view; template class __attribute__((__preferred_name__(string_view))) basic_string_view { public: basic_string_view() { } }; inline basic_string_view foo() { return basic_string_view(); } // A.cppm module; #include "string_view.h" export module A; // Use.cppm module; #include "string_view.h" export module Use; import A; ``` The diagnostic is ``` string_view.h:11:5: error: 'basic_string_view::basic_string_view' from module 'A.' is not present in definition of 'string_view' provided earlier ``` The underlying issue is that deserialization of the `preferred_name` attribute triggers deserialization of `basic_string_view`, which triggers the deserialization of the `preferred_name` attribute again (since it's attached to the `basic_string_view` template). The deserialization logic is implemented in a way that prevents it from going on a loop in a literal sense (it detects early on that it has already seen the `string_view` typedef when trying to start its deserialization for the second time), but leaves the typedef deserialization in an unfinished state. Subsequently, the `string_view` typedef from the deserialized module cannot be merged with the same typedef from `string_view.h`, resulting in the above diagnostic. This PR resolves the problem by delaying the deserialization of the `preferred_name` attribute until the deserialization of the `basic_string_view` template is completed. As a result of deferring, the deserialization of the `preferred_name` attribute doesn't need to go on a loop since the type of the `string_view` typedef is already known when it's deserialized. --- clang/include/clang/AST/Attr.h | 14 +++- clang/include/clang/Basic/Attr.td | 11 +++ clang/include/clang/Serialization/ASTReader.h | 19 +++++ .../clang/Serialization/ASTRecordReader.h | 13 ++- clang/lib/Serialization/ASTReader.cpp | 5 ++ clang/lib/Serialization/ASTReaderDecl.cpp | 79 ++++++++++++++++++- clang/lib/Serialization/ASTWriter.cpp | 16 ++-- clang/test/Modules/preferred_name.cppm | 12 ++- clang/utils/TableGen/ClangAttrEmitter.cpp | 4 + 9 files changed, 156 insertions(+), 17 deletions(-) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 3365ebe4d9012..bed532a84a1bd 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,6 +60,8 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -80,10 +82,11 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed) + attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false) {} + InheritEvenIfAlreadyPresent(false), + DeferDeserialization(DeferDeserialization) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -105,6 +108,8 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } + bool 
shouldDeferDeserialization() const { return DeferDeserialization; } + // Clone this attribute. Attr *clone(ASTContext &C) const; @@ -146,8 +151,9 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent) - : Attr(Context, CommonInfo, AK, IsLateParsed) { + bool InheritEvenIfAlreadyPresent, + bool DeferDeserialization = false) + : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 408d3adf370c8..3969dd8af5dfa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,6 +713,12 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; + // Set to true if deserialization of this attribute must be deferred until + // the parent Decl is fully deserialized (during header module file + // deserialization). E.g., this is the case for the preferred_name attribute, + // since its type deserialization depends on its target Decl type. + // (See https://github.com/llvm/llvm-project/issues/56490 for details). + bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3254,6 +3260,11 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; + // Type of this attribute depends on the target Decl type. + // Therefore, its deserialization must be deferred until + // deserialization of the target Decl is complete + // (for header modules). + let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index d77bb01c5aa59..c839215dc4077 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -1221,6 +1221,24 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; + /// Deserialization of some attributes must be deferred since they refer + /// to themselves in their type (e.g., preferred_name attribute refers to the + /// typedef that refers back to the template specialization of the template + /// that the attribute is attached to). + /// More attributes that store TypeSourceInfo might be potentially affected, + /// see https://github.com/llvm/llvm-project/issues/56490 for details. + struct DeferredAttribute { + // Index of the deferred attribute in the Record of the TargetedDecl. + uint64_t RecordIdx; + // Decl to attach a deferred attribute to. + Decl *TargetedDecl; + }; + + /// The collection of Decls that have been loaded but some of their attributes + /// have been deferred, paired with the index inside the record pointing + /// at the skipped attribute. 
+ SmallVector PendingDeferredAttributes; + template using DuplicateObjCDecls = std::pair; @@ -1570,6 +1588,7 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); + void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 2561418b78ca7..a29972fcf73a8 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,6 +83,12 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } + /// Returns the next N values in this record, without advancing. + uint64_t peekInts(unsigned N) { return Record[Idx + N]; } + + /// Skips the current value. + void skipInt() { Idx += 1; } + /// Skips the specified number of values. void skipInts(unsigned N) { Idx += N; } @@ -335,7 +341,12 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - void readAttributes(AttrVec &Attrs); + /// For some attributes (where type depends on itself recursively), defer + /// reading the attribute until the type has been read. + void readAttributes(AttrVec &Attrs, Decl *D = nullptr); + + /// Reads one attribute from the current stream position, advancing Idx. + Attr *readOrDeferAttrFor(Decl *D); /// Read an BTFTypeTagAttr object. BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 202227b195585..d08dc6b1b4d93 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -10180,6 +10180,11 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); + // Load the delayed preferred name attributes. + for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) + loadDeferredAttribute(PendingDeferredAttributes[I]); + PendingDeferredAttributes.clear(); + // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1c51a7b5e460f..06dff02ac6128 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -612,7 +612,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs); + Record.readAttributes(Attrs, D); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. 
D->setAttrsImpl(Attrs, Reader.getContext()); @@ -3093,6 +3093,8 @@ class AttrReader { return Reader.readInt(); } + uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } + bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3123,18 +3125,29 @@ class AttrReader { return Reader.readVersionTuple(); } + void skipInt() { Reader.skipInts(1); } + + void skipInts(unsigned N) { Reader.skipInts(N); } + + unsigned getCurrentIdx() { return Reader.getIdx(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } +/// Reads one attribute from the current stream position, advancing Idx. Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; + // Read and ignore the skip count, since attribute deserialization is not + // deferred on this pass. + Record.skipInt(); + Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3164,13 +3177,28 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position. -void ASTRecordReader::readAttributes(AttrVec &Attrs) { +/// Reads attributes from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. +void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readAttr()) + if (auto *A = readOrDeferAttrFor(D)) Attrs.push_back(A); } +/// Reads one attribute from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. 
+Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { + AttrReader Record(*this); + unsigned SkipCount = Record.peekInts(1); + if (!SkipCount) + return readAttr(); + Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); + Record.skipInts(SkipCount); + return nullptr; +} + //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -4459,6 +4487,49 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } +void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { + Decl *D = DA.TargetedDecl; + ModuleFile *M = getOwningModuleFile(D); + + unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); + const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; + RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); + + llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; + SavedStreamPosition SavedPosition(Cursor); + if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { + Error(std::move(Err)); + } + + Expected MaybeCode = Cursor.ReadCode(); + if (!MaybeCode) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + + toString(MaybeCode.takeError())); + } + unsigned Code = MaybeCode.get(); + + ASTRecordReader Record(*this, *Loc.F); + Expected MaybeRecCode = Record.readRecord(Cursor, Code); + if (!MaybeRecCode) { + llvm::report_fatal_error( + Twine( + "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + + toString(MaybeCode.takeError())); + } + unsigned RecCode = MaybeRecCode.get(); + if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " + "expected valid DeclCode") + + toString(MaybeCode.takeError())); + } + + Record.skipInts(DA.RecordIdx); + Attr *A = Record.readAttr(); + getContext().getDeclAttrs(D).push_back(A); +} + namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 55d3c2bb56f2c..1c4f5730df312 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,6 +37,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -5067,15 +5068,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - // FIXME: Clang can't handle the serialization/deserialization of - // preferred_name properly now. See - // https://github.com/llvm/llvm-project/issues/56490 for example. - if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs + auto SkipIdx = Record.size(); + // Add placeholder for the size of deferred attribute. 
+ Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5086,6 +5086,12 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" + + if (A->shouldDeferDeserialization()) { + // Record the actual size of deferred attribute (+ 1 to count the attribute + // kind). + Record[SkipIdx] = Record.size() - SkipIdx + 1; + } } /// Emit the list of attributes to the specified record. diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 806781a81c5ca..86ba6ae96db99 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,10 +53,16 @@ import A; export using ::foo_templ; //--- Use1.cpp -import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} -#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} - +// expected-no-diagnostics +import A; +#include "foo.h" //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; + +//--- Use3.cpp +#include "foo.h" +import A; +foo test; +int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44e..41730eba32ce2 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,6 +3043,10 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true" : "false"); } + if (R.getValueAsBit("DeferDeserialization")) { + OS << ", " + << "/*DeferDeserialization=*/true"; + } OS << ")\n"; for (auto const &ai : Args) { From 90a05f32166c4a45224a5eedbec9c5c7e21d2dbf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 09:26:49 +0100 Subject: [PATCH 04/45] [openmp] Support CET in z_Linux_asm.S (#123213) When libomp is built with -cf-protection, add endbr instructions to the start of functions for Intel CET support. --- openmp/runtime/src/z_Linux_asm.S | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index cc5344cdd124a..0bf9f07a13f14 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -19,6 +19,16 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# if defined(__ELF__) && defined(__CET__) && defined(__has_include) +# if __has_include() +# include +# endif +# endif + +# if !defined(_CET_ENDBR) +# define _CET_ENDBR +# endif + # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. // The delay operation has the effect of removing the current thread from @@ -66,6 +76,7 @@ ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): + _CET_ENDBR .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols @@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0): .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc + _CET_ENDBR .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz From 3c42a774569ee06fb02ce00e2d2d2ce517c894f3 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 09:38:00 +0100 Subject: [PATCH 05/45] [BOLT] Fix handling of LLVM_LIBDIR_SUFFIX (#122874) This fixes a number of issues introduced in #97130 when LLVM_LIBDIR_SUFFIX is a non-empty string. 
Make sure that the libdir is always referenced as `lib${LLVM_LIBDIR_SUFFIX}`, not as just `lib` or `${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}`. This is the standard libdir convention for all LLVM subprojects. Using `${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}` would result in a duplicate suffix. --- bolt/CMakeLists.txt | 4 ++-- bolt/runtime/CMakeLists.txt | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 9ac196ad0e821..04db160b64b05 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -163,8 +163,8 @@ if (BOLT_ENABLE_RUNTIME) add_llvm_install_targets(install-bolt_rt DEPENDS bolt_rt bolt COMPONENT bolt) - set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a") - set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a") + set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a") + set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a") endif() find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld") diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 40f4fbc9f30d5..0deb69a27d435 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -16,18 +16,18 @@ add_library(bolt_rt_instr STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") add_custom_command(TARGET bolt_rt_hugify POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") endif() set(BOLT_RT_FLAGS @@ -53,23 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") -install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) - set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + 
set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_instr_osx PRIVATE -target x86_64-apple-darwin19.6.0 ${BOLT_RT_FLAGS}) - install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") endif() endif() From c8ba551da17c48e00c0eeb572e7667ffa5109f6f Mon Sep 17 00:00:00 2001 From: Will Froom Date: Fri, 17 Jan 2025 08:41:33 +0000 Subject: [PATCH 06/45] [AArch64] Return early rather than asserting when Size of value passed to targetShrinkDemandedConstant is not 32 or 64 (#123084) See https://github.com/llvm/llvm-project/issues/123029 for details. --- .../Target/AArch64/AArch64ISelLowering.cpp | 5 +- .../half-precision-signof-no-assert.ll | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4a114c275fb7..7d3ca46204b67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2373,8 +2373,9 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( return false; unsigned Size = VT.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "i32 or i64 is expected after legalization."); + + if (Size != 32 && Size != 64) + return false; // Exit early if we demand all bits. 
if (DemandedBits.popcount() == Size) diff --git a/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll new file mode 100644 index 0000000000000..92e15e78d8c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Check that the following does not crash +; See https://github.com/llvm/llvm-project/issues/123029 for details + +define ptr @fn(ptr %in, ptr %out) { +; CHECK-LABEL: fn: +; CHECK: // %bb.0: // %fn +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.4h, #60, lsl #8 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fcmgt v2.4s, v1.4s, #0.0 +; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0 +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr h2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s1, s2, s1, mi +; CHECK-NEXT: fcsel s1, s2, s1, gt +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x1, #8] +; CHECK-NEXT: ret +fn: + %1 = load <4 x half>, ptr %in + %2 = fcmp one <4 x half> %1, zeroinitializer + %3 = uitofp <4 x i1> %2 to <4 x half> + store <4 x half> %3, ptr %out + + %4 = getelementptr inbounds nuw i8, ptr %in, i64 8 + %5 = load half, ptr %4 + %6 = fcmp one half %5, 0xH0000 + %7 = uitofp i1 %6 to half + %8 = call half @llvm.copysign.f16(half %7, half %5) + %9 = getelementptr inbounds nuw i8, ptr %out, i64 8 + store half %8, ptr %9 + ret ptr null +} From 9720be95d63ce797437015d0f0edd10b02e80b7a Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Fri, 17 Jan 2025 16:55:35 +0800 Subject: [PATCH 07/45] [LV][EVL] Disable fixed-order recurrence idiom with EVL tail folding. (#122458) The currently llvm.splice may occurs unexpected behavior if the evl of the second-to-last iteration is not VF*UF. Issue #122461 --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- ...ce-tail-with-evl-fixed-order-recurrence.ll | 90 +++++++++++-------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..8024cde41b5f9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1447,9 +1447,11 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath; + bool EVLIsLegal = + UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: remove this once fixed-ordered recurrence is supported. 
+ Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 9f8cf169c0593..809b69900731a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -11,6 +11,10 @@ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP +; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding. +; The llvm.splice may occurs unexpected behavior if the evl of the +; second-to-last iteration is not VF*UF. + define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define void @first_order_recurrence( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { @@ -27,31 +31,35 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = add zeroinitializer, [[TMP25]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP26]] +; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr 
[[TMP14]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, [[TMP27]], poison) ; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw [[TMP16]], [[VP_OP_LOAD]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP19]], ptr [[TMP18]], i32 4, [[TMP27]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -172,6 +180,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -182,27 +191,30 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 ; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP32:%.*]] = call 
@llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = add zeroinitializer, [[TMP32]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP33]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT4]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP34]], poison) ; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[TMP19]], [[TMP20]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP22]], i32 4, [[TMP34]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -218,12 +230,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -342,6 +354,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; 
IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -356,30 +369,33 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 ; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP39:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP40:%.*]] = add zeroinitializer, [[TMP39]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP40]] +; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT6]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, [[TMP41]], poison) ; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) ; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP42:%.*]] = add [[TMP27]], [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 -; 
IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP42]], ptr [[TMP26]], i32 4, [[TMP41]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -399,14 +415,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] From 0e13ce770bfbee7cfbc8086a038a950fe12c03d5 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 17 Jan 2025 16:59:04 +0800 Subject: [PATCH 08/45] [InstCombine] Handle mul in `maintainNoSignedWrap` (#123299) Alive2: https://alive2.llvm.org/ce/z/Kgamks Closes https://github.com/llvm/llvm-project/issues/123175. For `@foo1`, the nsw flag is propagated because we first convert it into `mul nsw nuw (shl nsw nuw X, 1), 3`. --- .../InstCombine/InstructionCombining.cpp | 23 ++++--- llvm/test/Transforms/InstCombine/nsw.ll | 60 +++++++++++++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2fb60ef11499c..fb21576722461 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -281,28 +281,33 @@ bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does -// not overflow. 
This function only handles the Add and Sub opcodes. For +// not overflow. This function only handles the Add/Sub/Mul opcodes. For // all other opcodes, the function conservatively returns false. static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; - // We reason about Add and Sub Only. - Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) - return false; - const APInt *BVal, *CVal; if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) return false; + // We reason about Add/Sub/Mul Only. bool Overflow = false; - if (Opcode == Instruction::Add) + switch (I.getOpcode()) { + case Instruction::Add: (void)BVal->sadd_ov(*CVal, Overflow); - else + break; + case Instruction::Sub: (void)BVal->ssub_ov(*CVal, Overflow); - + break; + case Instruction::Mul: + (void)BVal->smul_ov(*CVal, Overflow); + break; + default: + // Conservatively return false for other opcodes. + return false; + } return !Overflow; } diff --git a/llvm/test/Transforms/InstCombine/nsw.ll b/llvm/test/Transforms/InstCombine/nsw.ll index 329a47324f862..b00f2e58add78 100644 --- a/llvm/test/Transforms/InstCombine/nsw.ll +++ b/llvm/test/Transforms/InstCombine/nsw.ll @@ -415,3 +415,63 @@ define i8 @neg_nsw_mul_missing_nsw_on_mul(i8 %a1, i8 %a2, i8 %b) { %neg = sub nsw i8 0, %shl ret i8 %neg } + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop_neg(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_neg( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], -2201 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, -71 + %b = mul nsw i16 %a, 31 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw1(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw1( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw2(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw2( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_overflow(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_overflow( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], -31777 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 1023 + %b = mul nsw i16 %a, 33 + ret i16 %b +} From 320c2ee6c253f1bc0afe9c3d96cefb39195608f7 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 10:09:26 +0100 Subject: [PATCH 09/45] [BOLT] Pass -Wl,--build-id=none to linker in tests (#122886) This fixes the following tests: BOLT :: AArch64/check-init-not-moved.s BOLT :: X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test BOLT :: X86/dwarf5-locexpr-referrence.test When clang is compiled with `-DENABLE_LINKER_BUILD_ID=ON`. 
--- bolt/test/lit.local.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index e2fa0a4a2210f..d5a6849b27a77 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,5 +1,5 @@ host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" -common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -pie" +common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) From 58903c9b71ccb167ed1be4be9d9eddf1b2f07845 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 17 Jan 2025 10:21:54 +0100 Subject: [PATCH 10/45] [LLVM] Update AArch64 maintainers (#120440) This merges the maintainer lists for the ARM and AArch64 backends, as many people work on both to some degree. The list includes focus areas where possible. --- llvm/Maintainers.md | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e2af991ed37b1..10714b508ca68 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -169,10 +169,26 @@ rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets -#### AArch64 backend +#### ARM and AArch64 backends -Tim Northover \ -t.p.northover@gmail.com (email), [TNorthover](https://github.com/TNorthover) (GitHub) +David Green \ +david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ +Amara Emerson (esp. AArch64 GlobalISel) \ +amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ +Eli Friedman (esp. ARM64EC) \ +efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +Sjoerd Meijer \ +smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ +Nashe Mncube \ +nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ +Sander de Smalen (esp. scalable vectorization/SVE/SME) \ +sander.desmalen@arm.com (email), [sdesmalen-arm](https://github.com/sdesmalen-arm) (GitHub) \ +Peter Smith (Anything ABI) \ +peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ +Oliver Stannard (esp. 
assembly/dissassembly) \ +oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ +Ties Stuij (Arm GlobalISel and early arch support) \ +ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) #### AMDGPU backend @@ -184,19 +200,6 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co Mark Schimmel \ marksl@synopsys.com (email), [markschimmel](https://github.com/markschimmel) (GitHub) -#### ARM backend - -David Green \ -david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ -Oliver Stannard (Especially assembly/dissassembly) \ -oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ -Nashe Mncube \ -nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ -Peter Smith (Anything ABI) \ -peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ -Ties Stuij (GlobalISel and early arch support) \ -ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) - #### AVR backend Ben Shi \ @@ -480,6 +483,7 @@ James Grosbach (grosbach@apple.com) -- MC layer \ Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \ Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \ David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \ +Tim Northover (t.p.northover@gmail.com, [TNorthover](https://github.com/TNorthover)) -- AArch64 backend \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \ From 73478708839fad8b02b3cfc84959d64a15ba93ca Mon Sep 17 00:00:00 2001 From: Karl-Johan Karlsson Date: Fri, 17 Jan 2025 10:23:27 +0100 Subject: [PATCH 11/45] [diagtool] Make the BuiltinDiagnosticsByID table sorted (#120321) When building with -DLLVM_ENABLE_EXPENSIVE_CHECKS=ON with a recent libstdc++ (e.g. from gcc 13.3.0) the testcase clang/test/Misc/warning-flags-tree.c fail with the message: ``` + diagtool tree --internal .../include/c++/13.3.0/bits/stl_algo.h:2013: In function: _ForwardIterator std::lower_bound(_ForwardIterator, _ForwardIterator, const _Tp &, _Compare) [_ForwardIterator = const diagtool::DiagnosticRecord *, _Tp = diagtool::DiagnosticRecord, _Compare = bool (*)(const diagtool::DiagnosticRecord &, const diagtool::DiagnosticRecord &)] Error: elements in iterator range [first, last) are not partitioned by the predicate __comp and value __val. Objects involved in the operation: iterator "first" @ 0x7ffea8ef2fd8 { } iterator "last" @ 0x7ffea8ef2fd0 { } ``` The reason for this error is that std::lower_bound is called on BuiltinDiagnosticsByID without it being entirely sorted. Calling std::lower_bound If the range is not sorted, the behavior of this function is undefined. This is detected when building with expensive checks. To make BuiltinDiagnosticsByID sorted we need to slightly change the order the inc-files are included. The include of DiagnosticCrossTUKinds.inc in DiagnosticNames.cpp is included too early and should be moved down directly after DiagnosticCommentKinds.inc. As a part of pull request the includes that build up BuiltinDiagnosticsByID table are extracted into a common wrapper header file AllDiagnosticKinds.inc that is used by both clang and diagtool. 
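For illustration only (standalone C++, not diagtool code; the record type, comparator and sample IDs below are made up for the example): std::lower_bound is only specified for ranges that are partitioned with respect to the comparator, which a table sorted by DiagID guarantees, so asserting sortedness up front turns a silent wrong lookup into a hard failure. That is the same idea as the llvm::is_sorted assertion added in this change.

```
// Minimal sketch of the sorted-table lookup pattern; not part of the patch.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

struct Record { int ID; const char *Name; };

static bool orderByID(const Record &L, const Record &R) { return L.ID < R.ID; }

int main() {
  // The table must stay sorted by ID, otherwise lower_bound is undefined.
  static const Record Table[] = {{1, "first"}, {4, "second"}, {9, "third"}};
  assert(std::is_sorted(std::begin(Table), std::end(Table), orderByID) &&
         "table must be sorted by ID");

  const Record Key{4, nullptr};
  const Record *It =
      std::lower_bound(std::begin(Table), std::end(Table), Key, orderByID);
  if (It != std::end(Table) && It->ID == Key.ID)
    std::printf("found %s\n", It->Name);
  return 0;
}
```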
--- .../clang/Basic/AllDiagnosticKinds.inc | 33 +++++++++++++ clang/lib/Basic/DiagnosticIDs.cpp | 48 ++----------------- clang/tools/diagtool/DiagnosticNames.cpp | 22 ++++----- 3 files changed, 44 insertions(+), 59 deletions(-) create mode 100644 clang/include/clang/Basic/AllDiagnosticKinds.inc diff --git a/clang/include/clang/Basic/AllDiagnosticKinds.inc b/clang/include/clang/Basic/AllDiagnosticKinds.inc new file mode 100644 index 0000000000000..a946b4a640ac6 --- /dev/null +++ b/clang/include/clang/Basic/AllDiagnosticKinds.inc @@ -0,0 +1,33 @@ +//===--- AllDiagnosticKinds.inc----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines the Diagnostic IDs in ID sorted order. The order is dictated by +/// the enum in DiagnosticIDs.h#L49-L65. +/// +//===----------------------------------------------------------------------===// + +// Turn off clang-format, as the order of the includes are important to make +// sure tables based on Diagnostic IDs are partitioned/sorted based on +// DiagID. + +// clang-format off +#include "clang/Basic/DiagnosticCommonKinds.inc" +#include "clang/Basic/DiagnosticDriverKinds.inc" +#include "clang/Basic/DiagnosticFrontendKinds.inc" +#include "clang/Basic/DiagnosticSerializationKinds.inc" +#include "clang/Basic/DiagnosticLexKinds.inc" +#include "clang/Basic/DiagnosticParseKinds.inc" +#include "clang/Basic/DiagnosticASTKinds.inc" +#include "clang/Basic/DiagnosticCommentKinds.inc" +#include "clang/Basic/DiagnosticCrossTUKinds.inc" +#include "clang/Basic/DiagnosticSemaKinds.inc" +#include "clang/Basic/DiagnosticAnalysisKinds.inc" +#include "clang/Basic/DiagnosticRefactoringKinds.inc" +#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +// clang-format on diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d77f28c80b2eb..81194bbf2538e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -37,21 +37,7 @@ struct StaticDiagInfoDescriptionStringTable { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ char ENUM##_desc[sizeof(DESC)]; - // clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" - // clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -59,21 +45,7 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ DESC, -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include 
"clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -85,21 +57,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp index eb90f082437b3..c3a3002889c73 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -23,26 +23,13 @@ llvm::ArrayRef diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } - // FIXME: Is it worth having two tables, especially when this one can get // out of sync easily? 
static const DiagnosticRecord BuiltinDiagnosticsByID[] = { #define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY) \ {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -54,6 +41,13 @@ static bool orderByID(const DiagnosticRecord &Left, const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { DiagnosticRecord Key = {nullptr, DiagID, 0}; + // The requirement for lower_bound to produce a valid result it is + // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID), + // but as we want this function to work for all possible values of + // DiagID sent in as argument it is better to right away check if + // BuiltinDiagnosticsByID is sorted. + assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && + "IDs in BuiltinDiagnosticsByID must be sorted."); const DiagnosticRecord *Result = llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); assert(Result && "diagnostic not found; table may be out of date"); From 89e3a649f207021c0884ed5f8e56321c51854ac3 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 17 Jan 2025 17:29:22 +0800 Subject: [PATCH 12/45] [LoongArch] Emit R_LARCH_RELAX when expanding some macros (#120067) Emit `R_LARCH_RELAX` relocations when expanding some macros, including: - `la.tls.ie`, `la.tls.ld`, `la.tls.gd`, `la.tls.desc`, - `call36`, `tail36`. Other macros that need to emit `R_LARCH_RELAX` relocations was implemented in https://github.com/llvm/llvm-project/pull/72961, including: - `la.local`, `la.pcrel`, `la.pcrel` expanded as `la.abs`, `la`, `la.global`, `la/la.global` expanded as `la.pcrel`, `la.got`. Note: `la.tls.le` macro can be relaxed when expanded with `R_LARCH_TLS_LE_{HI20/ADD/LO12}_R` relocations. But if we do so, previously handwritten assembly code will occur error due to the redundant `add.{w/d}` followed by `la.tls.le`. So `la.tls.le` keeps to expands with `R_LARCH_TLS_LE_{HI20/LO12}`. 
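To illustrate what a relax hint ultimately buys us (a sketch only, not code added by this patch; `emitWithOptionalRelax` is an invented helper name and the LoongArch-specific enums come from the backend's MCTargetDesc headers): the primary relocation for the operand is emitted as before, and when the operand is a relax candidate and the `+relax` feature is enabled, an extra `R_LARCH_RELAX` fixup against a dummy zero expression is emitted next to it.

```
// Sketch of the fixup pairing in the MC code emitter; illustrative only.
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSubtargetInfo.h"
using namespace llvm;

static void emitWithOptionalRelax(MCContext &Ctx, const MCSubtargetInfo &STI,
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCExpr *Expr, unsigned FixupKind,
                                  SMLoc Loc, bool RelaxCandidate) {
  // The primary relocation for the operand, e.g. R_LARCH_TLS_IE_PC_HI20.
  Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind), Loc));
  // When the operand is a relax candidate and +relax is enabled, pair it
  // with an R_LARCH_RELAX fixup against a dummy zero expression so the
  // linker knows the preceding relocation may be relaxed.
  if (RelaxCandidate && STI.hasFeature(LoongArch::FeatureRelax)) {
    const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx);
    Fixups.push_back(MCFixup::create(
        0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), Loc));
  }
}
```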
--- .../AsmParser/LoongArchAsmParser.cpp | 23 +++++--- llvm/test/MC/LoongArch/Macros/aliases-la.s | 52 +++++++++++++++++++ llvm/test/MC/LoongArch/Macros/macros-call.s | 17 ++++++ llvm/test/MC/LoongArch/Macros/macros-la.s | 20 +++++++ 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index efc8b77f8d8fa..420b98b8a9c1f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1009,7 +1009,8 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -1083,7 +1084,8 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, @@ -1176,7 +1178,8 @@ void LoongArchAsmParser::emitLoadAddressTLSIE(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LD, LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSIELarge(MCInst &Inst, SMLoc IDLoc, @@ -1248,7 +1251,8 @@ void LoongArchAsmParser::emitLoadAddressTLSLD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSLDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1320,7 +1324,8 @@ void LoongArchAsmParser::emitLoadAddressTLSGD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1409,7 +1414,8 @@ void LoongArchAsmParser::emitLoadAddressTLSDesc(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSDescLarge(MCInst &Inst, SMLoc IDLoc, @@ -1500,8 +1506,9 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? 
Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); - const LoongArchMCExpr *LE = LoongArchMCExpr::create( - Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, + getContext(), /*RelaxHint=*/true); Out.emitInstruction( MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), diff --git a/llvm/test/MC/LoongArch/Macros/aliases-la.s b/llvm/test/MC/LoongArch/Macros/aliases-la.s index dd5a4d474e001..1b5b818f4348f 100644 --- a/llvm/test/MC/LoongArch/Macros/aliases-la.s +++ b/llvm/test/MC/LoongArch/Macros/aliases-la.s @@ -3,13 +3,26 @@ # RUN: llvm-mc --triple=loongarch64 %s \ # RUN: | FileCheck %s --check-prefix=NORMAL +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-pcrel < %s \ # RUN: | FileCheck %s --check-prefix=GTOPCR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOPCR-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOPCR-RELOC,GTOPCR-RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=GTOABS # RUN: llvm-mc --triple=loongarch64 --mattr=+la-local-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=LTOABS +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { + la $a0, sym # NORMAL: pcalau12i $a0, %got_pc_hi20(sym) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym) @@ -22,6 +35,16 @@ la $a0, sym # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC: R_LARCH_PCALA_HI20 sym 0x0 +# GTOPCR-RELAX: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, sym_global # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_global) @@ -34,6 +57,16 @@ la.global $a0, sym_global # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_HI20 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, $a1, sym_global_large # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global_large) # NORMAL-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_global_large) @@ -52,6 +85,11 @@ la.global $a0, $a1, sym_global_large # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global_large) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global_large) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global_large 0x0 +# RELOC-NEXT: 
R_LARCH_GOT64_PC_LO20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_global_large 0x0 + la.local $a0, sym_local # NORMAL: pcalau12i $a0, %pc_hi20(sym_local) # NORMAL-NEXT: addi.d $a0, $a0, %pc_lo12(sym_local) @@ -61,6 +99,11 @@ la.local $a0, sym_local # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local) +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.local $a0, $a1, sym_local_large # NORMAL: pcalau12i $a0, %pc_hi20(sym_local_large) # NORMAL-NEXT: addi.d $a1, $zero, %pc_lo12(sym_local_large) @@ -72,3 +115,12 @@ la.local $a0, $a1, sym_local_large # LTOABS-NEXT: ori $a0, $a0, %abs_lo12(sym_local_large) # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local_large) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local_large) + +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_local_large 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s index a648a39780381..df7715050a0f9 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-call.s +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -1,9 +1,26 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { call36 sym_call # CHECK: pcaddu18i $ra, %call36(sym_call) # CHECK-NEXT: jirl $ra, $ra, 0 +# RELOC-NEXT: R_LARCH_CALL36 sym_call 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + tail36 $t0, sym_tail # CHECK: pcaddu18i $t0, %call36(sym_tail) # CHECK-NEXT: jr $t0 + +# RELOC-NEXT: R_LARCH_CALL36 sym_tail 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index d4272b93ba54d..a732988ef1f1a 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -5,6 +5,12 @@ # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs \ # RUN: %s | FileCheck %s --check-prefix=ABS +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOABS-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOABS-RELOC,GTOABS-RELAX # RELOC: Relocations [ # RELOC-NEXT: Section ({{.*}}) .rela.text { @@ -36,6 +42,10 @@ la.pcrel $a0, sym_pcrel # RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 # RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, sym_got # CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) 
@@ -73,7 +83,9 @@ la.tls.ie $a0, sym_ie # ABS-NEXT: ld.d $a0, $a0, 0 # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.ld $a0, sym_ld # CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) @@ -85,7 +97,9 @@ la.tls.ld $a0, sym_ld # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_ld) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.gd $a0, sym_gd # CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) @@ -97,7 +111,9 @@ la.tls.gd $a0, sym_gd # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_gd) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.desc $a0, sym_desc # CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc) @@ -113,9 +129,13 @@ la.tls.desc $a0, sym_desc # ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 ############################################################# ## with a temporary register. From 31b62e2d3df86487e7443608b5a84df754b571fd Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Fri, 17 Jan 2025 17:30:57 +0800 Subject: [PATCH 13/45] [LoongArch] Add relax relocations for tls_le code sequence (#121329) This commit add relax relocations for `tls_le` code sequence. Handwritten assembly and generating source code by clang are both affected. Scheduled `tls_le` code sequence can be relaxed normally and we can add relax relocs when code emitting according to their relocs. Other relaxable macros' code sequence cannot simply add relax relocs according to their relocs, such as `PCALA_{HI20/LO12}`, we do not want to add relax relocs when code model is large. This will be implemented in later commit. --- .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 10 ++- .../MC/LoongArch/Relocations/relax-tls-le.s | 70 +++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-tls-le.s diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 359bde1244429..04d57f0fe7457 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -282,9 +282,11 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R: FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R: FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_PCREL20_S2: FixupKind = LoongArch::fixup_loongarch_pcrel20_s2; @@ -387,11 +389,17 @@ void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI, "Expected %le_add_r relocation on TP-relative symbol"); // Emit the correct %le_add_r relocation for the symbol. 
- // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled. Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r), MI.getLoc())); + // Emit R_LARCH_RELAX for %le_add_r when the relax feature is enabled. + if (STI.hasFeature(LoongArch::FeatureRelax)) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + // Emit a normal ADD instruction with the given operands. unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D ? LoongArch::ADD_D diff --git a/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s new file mode 100644 index 0000000000000..899f12f85654d --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=-relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch32 --mattr=+relax < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA32-RELAX-FIXUP %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA64-RELAX-FIXUP %s + +.long foo + +.ifndef LA64 + +lu12i.w $a0, %le_hi20_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.w $a0, $a0, $tp, %le_add_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.w $a0, $a0, %le_lo12_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.else + +lu12i.w $a0, %le_hi20_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.d $a0, $a0, $tp, %le_add_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 
+# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.d $a0, $a0, %le_lo12_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.endif + From 30e276d06d3176f145151cea96ab01af0c3e842a Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 17 Jan 2025 09:35:02 +0000 Subject: [PATCH 14/45] [clang][PCH] Don't try to create standalone debug-info for types marked nodebug (#123253) Fixes one of the crashes uncovered by https://github.com/llvm/llvm-project/pull/118710 `getOrCreateStandaloneType` asserts that a `DIType` was created for the requested type. If the `Decl` was marked `nodebug`, however, we can't generate debug-info for it, so we would previously trigger the assert. For now keep the assertion around and check the `nodebug` at the callsite. --- clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp | 3 +++ clang/test/Modules/gmodules-nodebug.cpp | 14 ++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 clang/test/Modules/gmodules-nodebug.cpp diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 5447b98d7105e..02635ce235a12 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -81,6 +81,9 @@ class PCHContainerGenerator : public ASTConsumer { if (!TD->isCompleteDefinition()) return true; + if (D->hasAttr()) + return true; + QualType QualTy = Ctx.getTypeDeclType(D); if (!QualTy.isNull() && CanRepresent(QualTy.getTypePtr())) DI.getOrCreateStandaloneType(QualTy, D->getLocation()); diff --git a/clang/test/Modules/gmodules-nodebug.cpp b/clang/test/Modules/gmodules-nodebug.cpp new file mode 100644 index 0000000000000..d83103768e838 --- /dev/null +++ b/clang/test/Modules/gmodules-nodebug.cpp @@ -0,0 +1,14 @@ +// REQUIRES: asserts + +// RUN: %clang_cc1 -std=c++23 -x c++-header -emit-pch -fmodule-format=obj \ +// RUN: -o %t.pch %s \ +// RUN: -mllvm -debug-only=pchcontainer &>%t-pch.ll +// RUN: cat %t-pch.ll | FileCheck %s + +template +using __void_t [[gnu::nodebug]] = void; + +__void_t<> func() {} + +// CHECK: !DICompileUnit +// CHECK-NOT: __void_t From d7e48fbf205a01fcbc109b2555b12aa0d37845a4 Mon Sep 17 00:00:00 2001 From: NimishMishra <42909663+NimishMishra@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:10:33 +0530 Subject: [PATCH 15/45] [llvm][OpenMP] Add implicit cast to omp.atomic.read (#114659) Should the operands of `omp.atomic.read` differ, emit an implicit cast. In case of `struct` arguments, extract the 0-th index, emit an implicit cast if required, and store at the destination. 
Fixes https://github.com/llvm/llvm-project/issues/112908 --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 31 ++++++++++ mlir/test/Target/LLVMIR/openmp-llvm.mlir | 71 +++++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 188a450d12fde..7dbf65fbf055b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -264,6 +264,33 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Emit an implicit cast to convert \p XRead to type of variable \p V +static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, + llvm::Value *V) { + // TODO: Add this functionality to the `AtomicInfo` interface + llvm::Type *XReadType = XRead->getType(); + llvm::Type *VType = V->getType(); + if (llvm::AllocaInst *vAlloca = dyn_cast(V)) + VType = vAlloca->getAllocatedType(); + + if (XReadType->isStructTy() && VType->isStructTy()) + // No need to extract or convert. A direct + // `store` will suffice. + return XRead; + + if (XReadType->isStructTy()) + XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0); + if (VType->isIntegerTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPToSI(XRead, VType); + else if (VType->isFloatingPointTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateSIToFP(XRead, VType); + else if (VType->isIntegerTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateIntCast(XRead, VType, true); + else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPCast(XRead, VType); + return XRead; +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -8501,6 +8528,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, } } checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); + if (XRead->getType() != V.Var->getType()) + XRead = emitImplicitCast(Builder, XRead, V.Var); Builder.CreateStore(XRead, V.Var, V.IsVolatile); return Builder.saveIP(); } @@ -8785,6 +8814,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( return AtomicResult.takeError(); Value *CapturedVal = (IsPostfixUpdate ? 
AtomicResult->first : AtomicResult->second); + if (CapturedVal->getType() != V.Var->getType()) + CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 390ecabaef21b..4e4b9e5698fe9 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -1368,6 +1368,77 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () { // ----- +// CHECK-LABEL: @omp_atomic_read_implicit_cast +llvm.func @omp_atomic_read_implicit_cast () { +//CHECK: %[[Z:.*]] = alloca float, i64 1, align 4 +//CHECK: %[[Y:.*]] = alloca double, i64 1, align 8 +//CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8 +//CHECK: %[[W:.*]] = alloca i32, i64 1, align 4 +//CHECK: %[[X_ELEMENT:.*]] = getelementptr { float, float }, ptr %3, i64 0 + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "z"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x f64 {bindc_name = "y"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x !llvm.array<2 x struct<(f32, f32)>> {bindc_name = "x"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "w"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(2 : index) : i64 + %10 = llvm.mlir.constant(1 : i64) : i64 + %11 = llvm.mlir.constant(0 : i64) : i64 + %12 = llvm.sub %8, %10 overflow : i64 + %13 = llvm.mul %12, %10 overflow : i64 + %14 = llvm.mul %13, %10 overflow : i64 + %15 = llvm.add %14, %11 overflow : i64 + %16 = llvm.mul %10, %9 overflow : i64 + %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8 +//CHECK: call void @__atomic_load(i64 8, ptr %[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0) +//CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8 +//CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0 +//CHECK: store float %[[EXT]], ptr %[[Y]], align 4 + omp.atomic.read %3 = %17 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fpext float %[[CAST]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %1 : !llvm.ptr, !llvm.ptr, f32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptrunc double %[[CAST]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %7 : !llvm.ptr, !llvm.ptr, i32 + 
+//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptosi double %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fptosi float %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %1 : !llvm.ptr, !llvm.ptr, f32 + llvm.return +} + +// ----- + // CHECK-LABEL: @omp_atomic_write // CHECK-SAME: (ptr %[[x:.*]], i32 %[[expr:.*]]) llvm.func @omp_atomic_write(%x: !llvm.ptr, %expr: i32) -> () { From fbb9d49506baa05a613ab88f983d31e0f838dbae Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 17 Jan 2025 17:51:42 +0800 Subject: [PATCH 16/45] [X86][APX] Support APX + AMX-MOVRS/AMX-TRANSPOSE (#123267) Ref.: https://cdrdv2.intel.com/v1/dl/getContent/784266 --- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 20 +-- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +-- llvm/lib/Target/X86/X86InstrAMX.td | 75 +++++---- llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll | 89 +++++++++++ .../X86/amx_movrs_transpose_intrinsics.ll | 30 ++++ .../CodeGen/X86/amx_transpose_intrinsics.ll | 146 ++++++++++++++++++ .../Disassembler/X86/AMX/x86-64-amx-movrs.txt | 96 ++++++++++++ .../MC/Disassembler/X86/amx-transpose-att.txt | 48 ++++++ llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s | 90 ++++++++++- llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s | 96 ++++++++++++ llvm/test/MC/X86/amx-transpose-att.s | 48 ++++++ llvm/test/MC/X86/amx-transpose-intel.s | 48 ++++++ llvm/test/TableGen/x86-instr-mapping.inc | 10 ++ 13 files changed, 758 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index fc8a0eaed140d..7fbba7f05e0a5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PTILELOADDRSV: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1V: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PT2RPNTLVWZ0V: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1V: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1V: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1V: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RSV: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1V: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RSV: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1V: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; default: llvm_unreachable("Impossible Opcode!"); diff --git 
a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90e3e15b1fb46..6d69665c17565 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; -#undef GET_EGPR_IF_ENABLED case X86::PTILELOADDRS: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; @@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1RST1: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RS: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RS: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a055ba91d3e17..85046228bc8c5 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -345,26 +345,33 @@ let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSys def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; } -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", - []>, VEX, WIG, T8,PS; +multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { + def Z0#rs#suffix : I, PS; + def Z0#rs#T1#suffix : I, PS; + def Z1#rs#suffix : I, PD; + def Z1#rs#T1#suffix : I, PD; +} - def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PS; +let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - def 
T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; let isPseudo = true in { def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -491,22 +498,6 @@ let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [Write } let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; - def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; let isPseudo = true in { def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -529,16 +520,20 @@ let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSy } } // HasAMXMOVRS, HasAMXTRANSPOSE -let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { - def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, XD; - def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, PD; +multiclass TILELOADDRS_Base { + def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; + def T1#suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, T8, PD; +} + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"">, VEX; +let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"_EVEX">, EVEX, NoCD8; + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let isPseudo = true, mayLoad = 1 in { def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll index da212a1850964..1b93ae029f27b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown 
-mattr=+amx-tile,+amx-movrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-LABEL: test_amx_internal: @@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void 
@llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) ret void @@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) ret void diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll index 
146b69773eb18..1f5758c804b2b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-LABEL: test_amx: @@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 ; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] +; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) @@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 { ; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 ; O2-NEXT: tilerelease ; O2-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, 
x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index cc4360317db7d..4cfd97afe721b 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { ; CHECK-LABEL: test_amx: @@ -16,6 +17,21 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] +; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] +; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] +; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] +; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] +; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] +; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] +; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) @@ -78,6 +94,46 @@ define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, 
{{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] +; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0] +; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] +; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] +; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] +; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] +; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] +; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) @@ -117,6 +173,30 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx3: +; EGPR: # %bb.0: +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] +; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: 
t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] +; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] +; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) @@ -179,6 +259,72 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_spill: +; EGPR: # %bb.0: +; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] +; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] +; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] +; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] +; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] +; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt 
index 6df44c87d2332..57e3153da401b 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -96,3 +96,99 @@ # ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%r18), %tmm6 +# INTEL: tileloaddrs tmm6, [r18 + 64] +0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] 
+0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%r18), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r18 + 64] +0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt index 8c6f1be80ba2d..d768630ac1475 100644 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -49,6 +49,54 @@ # INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] 0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff +# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + # ATT: ttransposed %tmm1, %tmm2 # INTEL: ttransposed tmm2, tmm1 0xc4,0xe2,0x7a,0x5f,0xd1 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index d780ad4f0e369..92db672e1c82d 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -86,4 +86,92 @@ // CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] - tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file + tileloaddrst1 -32(,%rbp,2), %tmm3 + +// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + 
t2rpntlvwz0rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrs 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index ccc7ac51a98a4..140d1aa6b198e 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -95,3 +95,99 @@ // CHECK: tileloaddrst1 tmm3, [2*rbp - 32] // CHECK: 
encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40] + tileloaddrs tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40] + tileloaddrst1 tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 tmm3, [2*rbp - 32] diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s index 21bbf258ac6ef..5158470f8c905 100644 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ b/llvm/test/MC/X86/amx-transpose-att.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + // CHECK: ttransposed %tmm1, %tmm5 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed %tmm1, %tmm5 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s index a772232ddbbf2..0d2c22f67a173 100644 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ b/llvm/test/MC/X86/amx-transpose-intel.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: 
[0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] + // CHECK: ttransposed tmm5, tmm1 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed tmm5, tmm1 diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index 55d392f5e271f..4f64d4b8d93d0 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -167,6 +167,16 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, + { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, + { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, + { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, + { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, + { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, + { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, + { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, + { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, + { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, + { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, { X86::TILELOADD_EVEX, X86::TILELOADD }, { X86::TILESTORED_EVEX, X86::TILESTORED }, From ee4282259d5993dfa0b7b8937541dd6ccaadf3d5 Mon Sep 17 00:00:00 2001 From: Nicholas <45984215+liusy58@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:55:55 +0800 Subject: [PATCH 17/45] [BOLT][AArch64]support `inline-small-functions` for AArch64 (#120187) Add some functions in `AArch64MCPlusBuilder.cpp` to support inline for AArch64. 
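A sketch of the intended effect, mirroring the new inline-small-function tests
added below (instruction sequence taken from those tests; the label name is
illustrative): when a callee that ends in a tail call is inlined, the tail call
has to become an ordinary call so that execution continues with the caller's
code after the inlined body. On AArch64 that is the `b` -> `bl` / `br` -> `blr`
rewrite performed by the new convertTailCallToCall hook:

  // callee before inlining:
  indirect:
      add w0, w1, w0
      br  x2              // indirect tail call

  // after inlining into the caller, BOLT emits:
      add w0, w1, w0
      blr x2              // converted from `br`, so control returns here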
--- bolt/lib/Passes/Inliner.cpp | 4 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 30 ++++++++++++ bolt/test/AArch64/inline-small-function-1.s | 42 ++++++++++++++++ bolt/test/AArch64/inline-small-function-2.s | 48 +++++++++++++++++++ 4 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 bolt/test/AArch64/inline-small-function-1.s create mode 100644 bolt/test/AArch64/inline-small-function-2.s diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index f004a8eeea185..1793f4ff1f148 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -310,13 +310,13 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB, if (MIB.isPseudo(Inst)) continue; - MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86()); + MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86() || BC.isAArch64()); // Fix branch target. Strictly speaking, we don't have to do this as // targets of direct branches will be fixed later and don't matter // in the CFG state. However, disassembly may look misleading, and // hence we do the fixing. - if (MIB.isBranch(Inst)) { + if (MIB.isBranch(Inst) && !MIB.isTailCall(Inst)) { assert(!MIB.isIndirectBranch(Inst) && "unexpected indirect branch in callee"); const BinaryBasicBlock *TargetBB = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d752751c17932..d84da10b5bbe6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -133,6 +133,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + MCPhysReg getStackPointer() const override { return AArch64::SP; } + + bool isPush(const MCInst &Inst) const override { return false; } + + bool isPop(const MCInst &Inst) const override { return false; } + + void createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + createDirectCall(Inst, Target, Ctx, false); + } + + bool convertTailCallToCall(MCInst &Inst) override { + int NewOpcode; + switch (Inst.getOpcode()) { + default: + return false; + case AArch64::B: + NewOpcode = AArch64::BL; + break; + case AArch64::BR: + NewOpcode = AArch64::BLR; + break; + } + + Inst.setOpcode(NewOpcode); + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + clearOffset(Inst); + return true; + } + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, CompFuncTy Comp) const override { const auto &AArch64ExprA = cast(A); diff --git a/bolt/test/AArch64/inline-small-function-1.s b/bolt/test/AArch64/inline-small-function-1.s new file mode 100644 index 0000000000000..3ea22a9915fb4 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-1.s @@ -0,0 +1,42 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=_Z3barP1A \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +# CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +# CHECK: Binary Function "_Z3barP1A" after inlining { +# CHECK-NOT: bl _Z3fooP1A +# CHECK: ldr x8, [x0] +# CHECK-NEXT: ldr w0, [x8] + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + ret + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-small-function-2.s b/bolt/test/AArch64/inline-small-function-2.s new file mode 100644 index 0000000000000..5eb7d391fd157 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-2.s @@ -0,0 +1,48 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=test \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +#CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +#CHECK: Binary Function "test" after inlining { +#CHECK-NOT: bl indirect +#CHECK: add w0, w1, w0 +#CHECK-NEXT: blr x2 + + .text + .globl indirect + .type indirect,@function +indirect: + add w0, w1, w0 + br x2 + .size indirect, .-indirect + + .globl test + .type test,@function +test: + stp x29, x30, [sp, #-32]! + stp x20, x19, [sp, #16] + mov x29, sp + mov w19, w1 + mov w20, w0 + bl indirect + add w8, w19, w20 + cmp w0, #0 + csinc w0, w8, wzr, eq + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size test, .-test + + .globl main + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main + + \ No newline at end of file From 3b3590aa59f6ba35c746c01c0692621494b62cab Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Fri, 17 Jan 2025 02:05:05 -0800 Subject: [PATCH 18/45] Revert "Revert "[InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL"" (#123313) Reverts llvm/llvm-project#123289 --- .../InstCombine/InstCombineMulDivRem.cpp | 176 +++++ .../InstCombine/fsqrtdiv-transform.ll | 631 ++++++++++++++++++ 2 files changed, 807 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0b2ded127ff7..b6acde9bdd110 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,6 +13,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" @@ -657,6 +658,94 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { return nullptr; } +// If we have the following pattern, +// X = 1.0/sqrt(a) +// R1 = X * X +// R2 = a/sqrt(a) +// then this method collects all the instructions that match R1 and R2. 
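(As a concrete illustration of the pattern being collected - a minimal IR
sketch patterned on the fsqrtdiv-transform.ll test added at the end of this
patch; value names are illustrative:

  ; before
  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
  %x    = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt   ; X  = 1/sqrt(a)
  %r1   = fmul reassoc double %x, %x                          ; R1 = X * X
  %r2   = fdiv reassoc double %a, %sqrt                       ; R2 = a/sqrt(a)

  ; after - valid because (1/sqrt(a))^2 == 1/a and a/sqrt(a) == sqrt(a)
  ; when a is known positive
  %inv  = fdiv reassoc double 1.000000e+00, %a                ; replaces R1
  %root = call reassoc double @llvm.sqrt.f64(double %a)       ; replaces R2
  %x2   = fmul reassoc double %inv, %root                     ; replaces X

so one sqrt and two fdivs collapse into one fdiv, one sqrt and one fmul.)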
+static bool getFSqrtDivOptPattern(Instruction *Div, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + Value *A; + if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) || + match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) { + for (User *U : Div->users()) { + Instruction *I = cast(U); + if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) + R1.insert(I); + } + + CallInst *CI = cast(Div->getOperand(1)); + for (User *U : CI->users()) { + Instruction *I = cast(U); + if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) + R2.insert(I); + } + } + return !R1.empty() && !R2.empty(); +} + +// Check legality for transforming +// x = 1.0/sqrt(a) +// r1 = x * x; +// r2 = a/sqrt(a); +// +// TO +// +// r1 = 1/a +// r2 = sqrt(a) +// x = r1 * r2 +// This transform works only when 'a' is known positive. +static bool isFSqrtDivToFMulLegal(Instruction *X, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + // Check if the required pattern for the transformation exists. + if (!getFSqrtDivOptPattern(X, R1, R2)) + return false; + + BasicBlock *BBx = X->getParent(); + BasicBlock *BBr1 = (*R1.begin())->getParent(); + BasicBlock *BBr2 = (*R2.begin())->getParent(); + + CallInst *FSqrt = cast(X->getOperand(1)); + if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() || + !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs()) + return false; + + // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed + // by recip fp as it is strictly meant to transform ops of type a/b to + // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag + // has been used(rather abused)in the past for algebraic rewrites. + if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs()) + return false; + + // Check the constraints on X, R1 and R2 combined. + // fdiv instruction and one of the multiplications must reside in the same + // block. If not, the optimized code may execute more ops than before and + // this may hamper the performance. + if (BBx != BBr1 && BBx != BBr2) + return false; + + // Check the constraints on instructions in R1. + if (any_of(R1, [BBr1](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() != BBr1 || !I->hasAllowReassoc()); + })) + return false; + + // Check the constraints on instructions in R2. + return all_of(R2, [BBr2](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combination of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. 
+ return (I->getParent() == BBr2 && I->hasAllowReassoc()); + }); +} + Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); @@ -1913,6 +2002,75 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); } +// Change +// X = 1/sqrt(a) +// R1 = X * X +// R2 = a * X +// +// TO +// +// FDiv = 1/a +// FSqrt = sqrt(a) +// FMul = FDiv * FSqrt +// Replace Uses Of R1 With FDiv +// Replace Uses Of R2 With FSqrt +// Replace Uses Of X With FMul +static Instruction * +convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, + const SmallPtrSetImpl &R1, + const SmallPtrSetImpl &R2, + InstCombiner::BuilderTy &B, InstCombinerImpl *IC) { + + B.SetInsertPoint(X); + + // Have an instruction that is representative of all of instructions in R1 and + // get the most common fpmath metadata and fast-math flags on it. + Value *SqrtOp = CI->getArgOperand(0); + auto *FDiv = cast( + B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp)); + auto *R1FPMathMDNode = (*R1.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R1FMF = (*R1.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R1) { + R1FPMathMDNode = MDNode::getMostGenericFPMath( + R1FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R1FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FDiv); + IC->eraseInstFromFunction(*I); + } + FDiv->setMetadata(LLVMContext::MD_fpmath, R1FPMathMDNode); + FDiv->copyFastMathFlags(R1FMF); + + // Have a single sqrt call instruction that is representative of all of + // instructions in R2 and get the most common fpmath metadata and fast-math + // flags on it. + auto *FSqrt = cast(CI->clone()); + FSqrt->insertBefore(CI); + auto *R2FPMathMDNode = (*R2.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R2FMF = (*R2.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R2) { + R2FPMathMDNode = MDNode::getMostGenericFPMath( + R2FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R2FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FSqrt); + IC->eraseInstFromFunction(*I); + } + FSqrt->setMetadata(LLVMContext::MD_fpmath, R2FPMathMDNode); + FSqrt->copyFastMathFlags(R2FMF); + + Instruction *FMul; + // If X = -1/sqrt(a) initially,then FMul = -(FDiv * FSqrt) + if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) { + Value *Mul = B.CreateFMul(FDiv, FSqrt); + FMul = cast(B.CreateFNeg(Mul)); + } else + FMul = cast(B.CreateFMul(FDiv, FSqrt)); + FMul->copyMetadata(*X); + FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | + FastMathFlags::unionValue(R1FMF, R2FMF)); + IC->replaceInstUsesWith(*X, FMul); + return IC->eraseInstFromFunction(*X); +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1937,6 +2095,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return R; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Convert + // x = 1.0/sqrt(a) + // r1 = x * x; + // r2 = a/sqrt(a); + // + // TO + // + // r1 = 1/a + // r2 = sqrt(a) + // x = r1 * r2 + SmallPtrSet R1, R2; + if (isFSqrtDivToFMulLegal(&I, R1, R2)) { + CallInst *CI = cast(I.getOperand(1)); + if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this)) + return D; + } + if (isa(Op0)) if (SelectInst *SI = dyn_cast(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll 
b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll new file mode 100644 index 0000000000000..6296954333e8a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll @@ -0,0 +1,631 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='instcombine' < %s | FileCheck %s + +@x = global double 0.000000e+00 +@r1 = global double 0.000000e+00 +@r2 = global double 0.000000e+00 +@r3 = global double 0.000000e+00 +@v = global [2 x double] zeroinitializer +@v1 = global [2 x double] zeroinitializer +@v2 = global [2 x double] zeroinitializer + +; div/mul/div1 in the same block. +define void @bb_constraint_case1(double %a) { +; CHECK-LABEL: define void @bb_constraint_case1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div/mul in one block and div1 in other block with conditional guard. +define void @bb_constraint_case2(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case2( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize. 
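+; (The legality check requires the 1/sqrt fdiv to share a basic block with at
+; least one of its matched users; here both the fmul and the a/sqrt(a) fdiv sit
+; behind the branch, so the rewrite is skipped.)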
+define void @bb_constraint_case3(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case3( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize. +define void @bb_constraint_case4(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case4( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END1]] +; CHECK: if.end1: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end1, label %if.then1 + +if.then1: ; preds = %if.end + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end1 + +if.end1: ; preds = %if.then1, %if.end + ret void +} + +; sqrt value comes from different blocks. Don't optimize. 
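+; (The pattern matcher requires the divisor of the 1/sqrt fdiv to be the sqrt
+; call itself; a phi merging two sqrt results does not match.)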
+define void @bb_constraint_case5(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case5( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]]) +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + br label %if.end + +if.else: ; preds = %entry + %add = fadd double %a, 1.000000e+01 + %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %sqrt = phi double[ %0, %if.then], [ %1, %if.else] + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize. 
+define void @bb_constraint_case6(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case6( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[TMP1]], ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else, label %if.then + +if.else: ; preds = %entry + %1 = load double, ptr @x + br label %if.end + +if.then: ; preds = %entry + %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %2, ptr @x + br label %if.end + +if.end: ; preds = %if.else, %if.then + %div = phi double [ %1, %if.else ], [ %2, %if.then ] + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value for mul comes from different blocks. Don't optimize. +define void @bb_constraint_case7(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case7( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.else1: +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = fdiv double 3.000000e+00, %a + br label %if.end + +if.else: ; preds = %entry + %d.not = icmp eq i32 %d, 0 + br 
i1 %d.not, label %if.else1, label %if.then1 + +if.then1: ; preds = %if.else + %2 = fdiv double 2.000000e+00, %a + br label %if.end + +if.else1: ; preds = %if.else + %3 = fmul reassoc double %div, %div + br label %if.end + +if.end: ; preds = %if.then1, %if.else1, %if.then + %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ] + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value of mul comes from two different blocks(as shown by select ins). +define void @bb_constraint_case8(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case8( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul reassoc double %div, %div + %mul = select i1 %c.not, double %1, double %2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; multiple instances of multiply ops to optimize. Optimize all. +define void @mutiple_multiply_instances(double %a, i32 %c) { +; CHECK-LABEL: define void @mutiple_multiply_instances( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP1]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP3]] +; CHECK-NEXT: store double [[MUL1]], ptr @r1, align 8 +; CHECK-NEXT: store double [[MUL2]], ptr @r3, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul double %a, %a + %3 = fmul reassoc double %div, %div + %4 = fmul reassoc double %div, %div + %mul1 = select i1 %c.not, double %1, double %3 + %mul2 = select i1 %c.not, double %4, double %2 + store double %mul1, ptr @r1 + store double %mul2, ptr @r3 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
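+; (The 1/sqrt fdiv must carry reassoc, arcp and ninf; here arcp is missing, so
+; the transform is not applied.)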
+define void @missing_arcp_flag_on_div(double %a) { +; CHECK-LABEL: define void @missing_arcp_flag_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. +define void @missing_reassoc_flag_on_mul(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
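+; (Every matched a/sqrt(a) fdiv must itself carry reassoc; here it does not, so
+; the transform is not applied.)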
+define void @missing_reassoc_flag_on_div1(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div = -1/sqrt(a) +define void @negative_fdiv_val(double %a) { +; CHECK-LABEL: define void @negative_fdiv_val( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc double [[SQRT1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_div1(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_mul(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv 
reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; FIXME: DIV in the result should get the fpmath metadata from %div. +define void @fpmath_metadata_on_div(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2:![0-9]+]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_all(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_all( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0 + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @vector_input(<2 x double> %a) { +; CHECK-LABEL: define void @vector_input( +; CHECK-SAME: <2 x double> [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc <2 x double> splat (double 1.000000e+00), [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc <2 x double> [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store <2 x double> [[DIV]], ptr @v, align 16 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @v1, align 16 +; CHECK-NEXT: store <2 x double> [[SQRT1]], ptr @v2, align 16 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) + %div = fdiv reassoc arcp ninf <2 x double>, %sqrt + store <2 x double> %div, ptr @v + %mul = fmul reassoc <2 x double> %div, %div + store <2 x double> %mul, ptr @v1 + %div1 = fdiv reassoc <2 x double> %a, %sqrt + store <2 x double> %div1, ptr @v2 + ret void +} + +define void @strict_fp_metadata(double %a) { +; CHECK-LABEL: define void @strict_fp_metadata( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: [[CALL:%.*]] = call 
double @llvm.sqrt.f64(double noundef [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[DIV]], double [[DIV]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[A]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV2]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %conv = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") + %call = call double @llvm.sqrt.f64(double noundef %a) + %div = call double @llvm.experimental.constrained.fdiv.f64(double %conv, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div, ptr @x + %mul = call double @llvm.experimental.constrained.fmul.f64(double %div, double %div, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %mul, ptr @r1 + %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div2, ptr @r2 + ret void +} + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +!0 = !{float 2.5} +!1 = !{float 3.5} +!2 = !{float 4.5} +!3 = !{float 5.5} +; CHECK: [[META0]] = !{float 5.500000e+00} +; CHECK: [[META1]] = !{float 4.500000e+00} +; CHECK: [[META2]] = !{float 3.500000e+00} From 2c9dc089fd6aeb7570206b0a8b36cfb9298c2893 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 17 Jan 2025 10:09:31 +0000 Subject: [PATCH 19/45] [AArch64] Use spill size when calculating callee saves size (NFC) (#123086) This is an NFC right now, as currently, all register and spill sizes are the same, but the spill size is the correct size to use here. 
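
For reference, a minimal sketch of the query this switches to (the same calls
appear in the diff below):

  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
  unsigned SpillSize = TRI->getSpillSize(*RC); // bytes a stack slot for this class must hold

In general the spill size of a register class need not equal the architectural
register width, which is why it is the right quantity for sizing callee-save
slots.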
--- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 206e410047db5..dd248cf39a5ce 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3795,14 +3795,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + auto *RC = TRI->getMinimalPhysRegClass(Reg); + assert(RC && "expected register class!"); + auto SpillSize = TRI->getSpillSize(*RC); if (AArch64::PPRRegClass.contains(Reg) || AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += RegSize; + SVECSStackSize += SpillSize; else - CSStackSize += RegSize; + CSStackSize += SpillSize; } // Increase the callee-saved stack size if the function has streaming mode From 32a4650f3c76efee3bd515e25d70ae39d980b071 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 17 Jan 2025 10:10:21 +0000 Subject: [PATCH 20/45] [AArch64] Avoid hardcoding spill size/align in FrameLowering (NFC) (#123080) This is already defined for each register class in AArch64RegisterInfo, not hardcoding it here makes these values easier to change (perhaps based on hardware mode). --- .../Target/AArch64/AArch64FrameLowering.cpp | 69 ++++++------------- 1 file changed, 20 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index dd248cf39a5ce..1582d1999ca1d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2926,26 +2926,12 @@ struct RegPairInfo { int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; + const TargetRegisterClass *RC; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } - unsigned getScale() const { - switch (Type) { - case PPR: - return 2; - case GPR: - case FPR64: - case VG: - return 8; - case ZPR: - case FPR128: - return 16; - } - llvm_unreachable("Unsupported type"); - } - bool isScalable() const { return Type == PPR || Type == ZPR; } }; @@ -3023,20 +3009,27 @@ static void computeCalleeSaveRegisterPairs( RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); - if (AArch64::GPR64RegClass.contains(RPI.Reg1)) + if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::GPR; - else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::GPR64RegClass; + } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR64; - else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR64RegClass; + } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR128; - else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR128RegClass; + } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::ZPR; - else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::PPR; - else if (RPI.Reg1 == AArch64::VG) + RPI.RC = &AArch64::PPRRegClass; + } else if (RPI.Reg1 == AArch64::VG) { RPI.Type = RegPairInfo::VG; - 
else + RPI.RC = &AArch64::FIXED_REGSRegClass; + } else { llvm_unreachable("Unsupported register class."); + } // Add the stack hazard size as we transition from GPR->FPR CSRs. if (AFI->hasStackHazardSlotIndex() && @@ -3045,7 +3038,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; - int Scale = RPI.getScale(); + int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3254,38 +3247,26 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; } @@ -3495,33 +3476,23 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: continue; From e79bb8731ae9089f0635e5634883267a091e318d Mon Sep 17 00:00:00 2001 From: Sushant Gokhale Date: Fri, 17 Jan 2025 02:14:04 -0800 Subject: [PATCH 21/45] [InstCombine] Fixup commit 7253c6f (#123315) This should fix the assert failure we were getting for the darwin OS. 
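
(Presumably the problem was erasing the visited fdiv inside the helper and
returning null, after which visitFDiv kept working on an erased instruction;
returning the result of replaceInstUsesWith is the usual InstCombine idiom and
leaves the erasure to the driver. See the one-line diff below.)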
--- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index b6acde9bdd110..df5f9833a2ff9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -2067,8 +2067,7 @@ convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, FMul->copyMetadata(*X); FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | FastMathFlags::unionValue(R1FMF, R2FMF)); - IC->replaceInstUsesWith(*X, FMul); - return IC->eraseInstFromFunction(*X); + return IC->replaceInstUsesWith(*X, FMul); } Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { From 9491f75e1d912b277247450d1c7b6d56f7faf885 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 17 Jan 2025 10:34:57 +0000 Subject: [PATCH 22/45] Reland: [LV]: Teach LV to recursively (de)interleave. (#122989) This commit relands the changes from "[LV]: Teach LV to recursively (de)interleave. #89018" Reason for revert: - The patch exposed a bug in the IA pass, the bug is now fixed and landed by commit: #122643 --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 79 +- .../AArch64/sve-interleaved-accesses.ll | 260 +++- .../sve-interleaved-masked-accesses.ll | 252 ++++ .../RISCV/interleaved-accesses.ll | 1318 +++++++++-------- .../AArch64/sve-interleave-vectorization.ll | 135 ++ 6 files changed, 1387 insertions(+), 671 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8024cde41b5f9..6df11abda9e98 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3505,10 +3505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9435,9 +9435,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. 
+ assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..5ae2f43e4950c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2863,10 +2863,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2952,15 +2963,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3000,22 +3007,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. 
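+      // Illustration (InterleaveFactor == 4, a sketch of the loop below):
+      //   wide               = [x0 y0 z0 t0 x1 y1 z1 t1 ...]
+      //   {evens, odds}      = deinterleave2(wide)   ; evens = [x z ...], odds = [y t ...]
+      //   {member0, member2} = deinterleave2(evens)
+      //   {member1, member3} = deinterleave2(odds)
+      // so after the loop DeinterleavedValues[I] holds member I of the group.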
+ SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], 
label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr 
%arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; 
CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label 
%for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 
[[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; 
SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void 
@masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + 
%array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = 
getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void 
@load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], 
[[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: 
[[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add 
i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 
0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: 
@load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: 
[[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> 
[[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 
+550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] 
] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x 
i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label 
[[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 
[ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> 
poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = 
extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], 
align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; 
FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; 
FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x 
i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; 
SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: 
vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT:
[[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], 
align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 
1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 
[[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; 
CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> 
poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr 
[[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] 
+; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 
1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; 
CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} From b068f2fd0fefca1ee357483333f034d18e6d8214 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 17 Jan 2025 11:36:12 +0100 Subject: [PATCH 23/45] [LLD][COFF] Process bitcode files separately for each symbol table on ARM64X (#123194) --- lld/COFF/COFFLinkerContext.h | 1 - lld/COFF/Driver.cpp | 41 ++++++++++++++++--------------- lld/COFF/InputFiles.cpp | 19 ++++++++++----- lld/COFF/InputFiles.h | 14 ++++++++--- lld/COFF/SymbolTable.cpp | 17 ++++++------- lld/COFF/SymbolTable.h | 8 ++++++ lld/test/COFF/lto-arm64x.ll | 47 ++++++++++++++++++++++++++++++++++++ 7 files changed, 108 
insertions(+), 39 deletions(-) create mode 100644 lld/test/COFF/lto-arm64x.ll diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index bdd625b8c3916..8322f829d4055 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -56,7 +56,6 @@ class COFFLinkerContext : public CommonLinkerContext { std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; - std::vector bitcodeFileInstances; MergeChunk *mergeChunkInstances[Log2MaxSectionAlignment + 1] = {}; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 8b1a8dc3e5af7..898c6c17d2062 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -218,7 +218,7 @@ void LinkerDriver::addFile(InputFile *file) { << " linked in after " "doing LTO compilation."; } - ctx.bitcodeFileInstances.push_back(f); + f->symtab.bitcodeFileInstances.push_back(f); } else if (auto *f = dyn_cast(file)) { ctx.importFileInstances.push_back(f); } @@ -285,7 +285,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, addFile(make(ctx, mbref)); break; case file_magic::bitcode: - addFile(make(ctx, mbref, "", 0, lazy)); + addFile(BitcodeFile::create(ctx, mbref, "", 0, lazy)); break; case file_magic::coff_object: case file_magic::coff_import_library: @@ -374,8 +374,8 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, if (magic == file_magic::coff_object) { obj = ObjFile::create(ctx, mb); } else if (magic == file_magic::bitcode) { - obj = - make(ctx, mb, parentName, offsetInArchive, /*lazy=*/false); + obj = BitcodeFile::create(ctx, mb, parentName, offsetInArchive, + /*lazy=*/false); } else if (magic == file_magic::coff_cl_gl_object) { Err(ctx) << mb.getBufferIdentifier() << ": is not a native COFF file. Recompile without /GL?"; @@ -2571,19 +2571,19 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } - // If any inputs are bitcode files, the LTO code generator may create - // references to library functions that are not explicit in the bitcode - // file's symbol table. If any of those library functions are defined in a - // bitcode file in an archive member, we need to arrange to use LTO to - // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) { - llvm::Triple TT( - ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); - for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) - ctx.symtab.addLibcall(s); - } - ctx.forEachSymtab([&](SymbolTable &symtab) { + // If any inputs are bitcode files, the LTO code generator may create + // references to library functions that are not explicit in the bitcode + // file's symbol table. If any of those library functions are defined in + // a bitcode file in an archive member, we need to arrange to use LTO to + // compile those archive members by adding them to the link beforehand. + if (!symtab.bitcodeFileInstances.empty()) { + llvm::Triple TT( + symtab.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) + symtab.addLibcall(s); + } + // Windows specific -- if __load_config_used can be resolved, resolve // it. if (symtab.findUnderscore("_load_config_used")) @@ -2639,8 +2639,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // If we are going to do codegen for link-time optimization, check for // unresolvable symbols first, so we don't spend time generating code that // will fail to link anyway. 
- if (!ctx.bitcodeFileInstances.empty() && !config->forceUnresolved) - ctx.symtab.reportUnresolvable(); + if (!config->forceUnresolved) + ctx.forEachSymtab([](SymbolTable &symtab) { + if (!symtab.bitcodeFileInstances.empty()) + symtab.reportUnresolvable(); + }); if (errorCount()) return; @@ -2655,7 +2658,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // link those files (unless -thinlto-index-only was given, in which case we // resolve symbols and write indices, but don't generate native code or link). ltoCompilationDone = true; - ctx.symtab.compileBitcodeFiles(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.compileBitcodeFiles(); }); if (Defined *d = dyn_cast_or_null(ctx.symtab.findUnderscore("_tls_used"))) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 66641ff9dcc1f..5ee73d4dc4f8b 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1229,10 +1229,15 @@ void ImportFile::parse() { } } -BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy) - : InputFile(ctx.symtab, BitcodeKind, mb, lazy) { +BitcodeFile::BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &o, bool lazy) + : InputFile(symtab, BitcodeKind, mb, lazy) { + obj.swap(o); +} + +BitcodeFile *BitcodeFile::create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, + uint64_t offsetInArchive, bool lazy) { std::string path = mb.getBufferIdentifier().str(); if (ctx.config.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier(), @@ -1252,7 +1257,9 @@ BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, sys::path::filename(path) + utostr(offsetInArchive))); - obj = check(lto::InputFile::create(mbref)); + std::unique_ptr obj = check(lto::InputFile::create(mbref)); + return make(ctx.getSymtab(getMachineType(obj.get())), mb, obj, + lazy); } BitcodeFile::~BitcodeFile() = default; @@ -1329,7 +1336,7 @@ void BitcodeFile::parseLazy() { } } -MachineTypes BitcodeFile::getMachineType() const { +MachineTypes BitcodeFile::getMachineType(const llvm::lto::InputFile *obj) { Triple t(obj->getTargetTriple()); switch (t.getArch()) { case Triple::x86_64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index d3075c5e0a338..823561cda247a 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -386,13 +386,19 @@ class ImportFile : public InputFile { // Used for LTO. 
class BitcodeFile : public InputFile { public: - explicit BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy); + explicit BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &obj, bool lazy); ~BitcodeFile(); + + static BitcodeFile *create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, uint64_t offsetInArchive, + bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } ArrayRef getSymbols() { return symbols; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { + return getMachineType(obj.get()); + } + static MachineTypes getMachineType(const llvm::lto::InputFile *obj); void parseLazy(); std::unique_ptr obj; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 36dcd0dfe1389..bf965e8a2332d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -347,8 +347,8 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// defined symbol imported" diagnostic for symbols in localImports. /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. -static void reportProblemSymbols( - COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, +void SymbolTable::reportProblemSymbols( + const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -392,7 +392,7 @@ static void reportProblemSymbols( processFile(file, file->getSymbols()); if (needBitcodeFiles) - for (BitcodeFile *file : ctx.bitcodeFileInstances) + for (BitcodeFile *file : bitcodeFileInstances) processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) @@ -423,8 +423,7 @@ void SymbolTable::reportUnresolvable() { undefs.insert(sym); } - reportProblemSymbols(ctx, undefs, - /* localImports */ nullptr, true); + reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } bool SymbolTable::resolveRemainingUndefines() { @@ -506,8 +505,8 @@ bool SymbolTable::resolveRemainingUndefines() { } reportProblemSymbols( - ctx, undefs, - ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); + undefs, ctx.config.warnLocallyDefinedImported ? 
&localImports : nullptr, + false); return foundLazy; } @@ -1124,13 +1123,13 @@ Symbol *SymbolTable::addUndefined(StringRef name) { } void SymbolTable::compileBitcodeFiles() { - if (ctx.bitcodeFileInstances.empty()) + if (bitcodeFileInstances.empty()) return; llvm::TimeTraceScope timeScope("Compile bitcode"); ScopedTimer t(ctx.ltoTimer); lto.reset(new BitcodeCompiler(ctx)); - for (BitcodeFile *f : ctx.bitcodeFileInstances) + for (BitcodeFile *f : bitcodeFileInstances) lto->add(*f); for (InputFile *newObj : lto->compile()) { ObjFile *obj = cast(newObj); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 9e316fcdbe630..66bca0d63e5ff 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -14,6 +14,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -155,6 +156,8 @@ class SymbolTable { callback(pair.second); } + std::vector bitcodeFileInstances; + DefinedRegular *loadConfigSym = nullptr; uint32_t loadConfigSize = 0; void initializeLoadConfig(); @@ -175,6 +178,11 @@ class SymbolTable { std::unique_ptr lto; std::vector> entryThunks; llvm::DenseMap exitThunks; + + void + reportProblemSymbols(const llvm::SmallPtrSetImpl &undefs, + const llvm::DenseMap *localImports, + bool needBitcodeFiles); }; std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex); diff --git a/lld/test/COFF/lto-arm64x.ll b/lld/test/COFF/lto-arm64x.ll new file mode 100644 index 0000000000000..bbfc6b64c6fce --- /dev/null +++ b/lld/test/COFF/lto-arm64x.ll @@ -0,0 +1,47 @@ +; REQUIRES: aarch64, x86 +; RUN: split-file %s %t.dir && cd %t.dir + +; RUN: llvm-as arm64ec.ll -o arm64ec.obj +; RUN: llvm-as aarch64.ll -o aarch64.obj +; RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +; RUN: lld-link -machine:arm64x aarch64.obj arm64ec.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out.exe -subsystem:console +; RUN: llvm-objdump -d out.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001004: d65f03c0 ret +; CHECK-NEXT: ... 
+; CHECK-NEXT: 140002000: 00000009 udf #0x9
+; CHECK-NEXT: 140002004: 52800040 mov w0, #0x2 // =2
+; CHECK-NEXT: 140002008: d65f03c0 ret
+
+; CHECK: 0000000140003000 <.hexpthk>:
+; CHECK-NEXT: 140003000: 48 8b c4 movq %rsp, %rax
+; CHECK-NEXT: 140003003: 48 89 58 20 movq %rbx, 0x20(%rax)
+; CHECK-NEXT: 140003007: 55 pushq %rbp
+; CHECK-NEXT: 140003008: 5d popq %rbp
+; CHECK-NEXT: 140003009: e9 f6 ef ff ff jmp 0x140002004 <.text+0x1004>
+; CHECK-NEXT: 14000300e: cc int3
+; CHECK-NEXT: 14000300f: cc int3
+
+#--- arm64ec.ll
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64ec-unknown-windows-msvc"
+
+define dso_local i32 @mainCRTStartup() {
+entry:
+  ret i32 2
+}
+
+#--- aarch64.ll
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-windows-msvc"
+
+define dso_local i32 @mainCRTStartup() {
+entry:
+  ret i32 1
+}
From 101109fc5460d5bb9bb597c6ec77f998093a6687 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 17 Jan 2025 11:40:34 +0100
Subject: [PATCH 24/45] [MLIR] Add missing include (NFC)

Needed for libstdc++ 15 compatibility.
---
 mlir/include/mlir/Target/SPIRV/Deserialization.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/include/mlir/Target/SPIRV/Deserialization.h b/mlir/include/mlir/Target/SPIRV/Deserialization.h
index e39258beeaac8..a346a7fd1e5f7 100644
--- a/mlir/include/mlir/Target/SPIRV/Deserialization.h
+++ b/mlir/include/mlir/Target/SPIRV/Deserialization.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Support/LLVM.h"
+#include

 namespace mlir {
 class MLIRContext;
From 831527a5ef63d24d056afc92509caf5ceb1d3682 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Fri, 17 Jan 2025 10:49:43 +0000
Subject: [PATCH 25/45] [FMV][GlobalOpt] Statically resolve calls to versioned functions. (#87939)

To deduce whether the optimization is legal we need to compare the target
features between caller and callee versions. The criteria for bypassing the
resolver are the following:

* If the callee's feature set is a subset of the caller's feature set, then
  the callee is a candidate for direct call.

* Among such candidates the one of highest priority is the best match and it
  shall be picked, unless there is a version of the callee with higher
  priority than the best match which cannot be picked from a higher priority
  caller (directly or through the resolver).

* For every higher priority callee version than the best match, there is a
  higher priority caller version whose feature set availability is implied by
  the callee's feature set.

Example:

Callers and Callees are ordered in decreasing priority.
The arrows indicate successful call redirections.
Caller           Callee     Explanation
=========================================================================
mops+sve2 --+--> mops       all the callee versions are subsets of the
            |               caller but mops has the highest priority
            |
mops      --+    sve2       between mops and default callees, mops wins
sve              sve        between sve and default callees, sve wins
                            but sve2 does not have a high priority caller
default -----> default      sve (callee) implies sve (caller),
                            sve2(callee) implies sve (caller),
                            mops(callee) implies mops(caller)
---
 .../llvm/Analysis/TargetTransformInfo.h | 17 +
 .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
 .../llvm/TargetParser/AArch64TargetParser.h | 13 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp | 8 +
 .../AArch64/AArch64TargetTransformInfo.cpp | 14 +
 .../AArch64/AArch64TargetTransformInfo.h | 4 +
 llvm/lib/TargetParser/AArch64TargetParser.cpp | 31 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 162 ++++++++
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 365 ++++++++++++++++++
 9 files changed, 608 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fe13fc676e303..71b204f9c3fec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1870,6 +1870,13 @@ class TargetTransformInfo {
   /// false, but it shouldn't matter what it returns anyway.
   bool hasArmWideBranch(bool Thumb) const;
 
+  /// Returns a bitmask constructed from the target-features or fmv-features
+  /// metadata of a function.
+  uint64_t getFeatureMask(const Function &F) const;
+
+  /// Returns true if this is an instance of a function with multiple versions.
+  bool isMultiversionedFunction(const Function &F) const;
+
   /// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. 
uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? "fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. 
+ ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. + uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..bf0cacc6224be 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,165 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. 
+ for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. + SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2866,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. 
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..90bd98a9b0d38 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,365 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 
@test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 
= and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr 
@test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } 
+attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" } From ad282f4c1fdcb6e03914d9dab4f85fad5b16e864 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 10:50:05 +0000 Subject: [PATCH 26/45] [X86] Rename combineScalarToVector to combineSCALAR_TO_VECTOR. NFC. Match the file style of using the ISD NodeType name for the combine/lower method name. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6d69665c17565..de5bb08ae3a39 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58572,8 +58572,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -59266,7 +59266,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG, Subtarget); + return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: From 0ab368c5735328298d99dcfb80da12e7be028583 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 17 Jan 2025 10:54:39 +0000 Subject: [PATCH 27/45] SCEV/test: cover implied-via-addition (#123082) Since cf2e828 (SCEV: regen some tests with UTC) had the side-effect of moving an implied-via-addition test into IndVarSimplify, implication via addition is no longer covered in the SCEV tests. Fix this by writing fresh tests and checking backedge-taken output from SCEV. --- .../ScalarEvolution/implied-via-addition.ll | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll new file mode 100644 index 0000000000000..7ab6221d0da53 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @implied1(i32 %n) { +; Prove that (n s> 1) ===> (n - 1 s> 0). 
+; CHECK-LABEL: 'implied1' +; CHECK-NEXT: Determining loop execution counts for: @implied1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied1_neg(i32 %n) { +; Prove that (n s> 0) =\=> (n - 1 s> 0). +; CHECK-LABEL: 'implied1_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2(i32 %n) { +; Prove that (n u>= -1) ===> (n + 1 u>= 0). +; CHECK-LABEL: 'implied2' +; CHECK-NEXT: Determining loop execution counts for: @implied2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %header: Predicated backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.1 = add nuw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp uge i32 %n.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2_neg(i32 %n) { +; Prove that (n u>= -1) =\=> (n - 1 s>= 0). 
+; CHECK-LABEL: 'implied2_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.minus.1 = sub nuw nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sge i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} From 437834e16be6d04e7b198dad8a42d507770251a1 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 10:55:28 +0000 Subject: [PATCH 28/45] [Flang] Use a module directory to avoid race condition (#123215) Use a module directory in a test that uses another fortran test to avoid race conditions in module creation. --- flang/test/Lower/module_use.f90 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90 index ad43865470b68..b976663239ef5 100644 --- a/flang/test/Lower/module_use.f90 +++ b/flang/test/Lower/module_use.f90 @@ -1,5 +1,6 @@ -! RUN: bbc -emit-fir %S/module_definition.f90 -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t +! RUN: bbc -emit-fir -module %t %S/module_definition.f90 +! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s ! Test use of module data not defined in this file. ! The modules are defined in module_definition.f90 From 21704a685de5f241acddf462e5f9b38d132cfcaa Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 17 Jan 2025 03:00:02 -0800 Subject: [PATCH 29/45] [AMDGPU] Fix printing hasInitWholeWave in mir (#123232) --- .../lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +- llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..7de64bddf7884 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -715,7 +715,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), - Mode(MFI.getMode()) { + Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll new file mode 100644 index 0000000000000..f3b8deff61918 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: init_wwm +; GCN: hasInitWholeWave: true +define void @init_wwm(ptr addrspace(1) inreg %p) { +entry: + %entry_exec = call i1 
@llvm.amdgcn.init.whole.wave() + br i1 %entry_exec, label %bb.1, label %bb.2 + +bb.1: + store i32 1, ptr addrspace(1) %p + br label %bb.2 + +bb.2: + ret void +} From 0d7c8c0e294d23fcfc9a396dafebe1465c471035 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 17 Jan 2025 12:07:52 +0100 Subject: [PATCH 30/45] [bazel] Add new file added in 437834e16be6d04e7b198dad8a42d507770251a1 --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index f72babb646a85..2286d4cd35e08 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -700,6 +700,7 @@ cc_library( includes = ["include"], textual_hdrs = [ # keep sorted + "include/clang/Basic/AllDiagnosticKinds.inc", "include/clang/Basic/AttrHasAttributeImpl.inc", "include/clang/Basic/AttrList.inc", "include/clang/Basic/AttrSubMatchRulesList.inc", From f66a5e220cbc2650a5843db854d0734d2aaa030f Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 17 Jan 2025 12:13:30 +0100 Subject: [PATCH 31/45] [lldb] Fix SBThread::StepOverUntil for discontinuous functions (#123046) I think the only issue here was that we would erroneously consider functions which are "in the middle" of the function were stepping to as a part of the function, and would try to step into them (likely stepping out of the function instead) instead of giving up early. --- lldb/include/lldb/Symbol/Function.h | 5 + lldb/source/API/SBThread.cpp | 5 +- .../thread/step_until/TestStepUntilAPI.py | 133 ++++++++++++++++++ .../thread/step_until/function.list | 1 + .../functionalities/thread/step_until/main.c | 3 + .../thread/step_until/symbol.order | 9 ++ 6 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py create mode 100644 lldb/test/API/functionalities/thread/step_until/function.list create mode 100644 lldb/test/API/functionalities/thread/step_until/symbol.order diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 157c007bdf0e8..d0b27269568b0 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -454,6 +454,11 @@ class Function : public UserID, public SymbolContextScope { /// and variables). const Address &GetAddress() const { return m_address; } + bool GetRangeContainingLoadAddress(lldb::addr_t load_addr, Target &target, + AddressRange &range) { + return m_block.GetRangeContainingLoadAddress(load_addr, target, range); + } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e61c83889b0b..cc848076dab5f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -842,7 +842,6 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, // appropriate error message. 
bool all_in_function = true; - AddressRange fun_range = frame_sc.function->GetAddressRange(); std::vector step_over_until_addrs; const bool abort_other_plans = false; @@ -859,7 +858,9 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, addr_t step_addr = sc.line_entry.range.GetBaseAddress().GetLoadAddress(target); if (step_addr != LLDB_INVALID_ADDRESS) { - if (fun_range.ContainsLoadAddress(step_addr, target)) + AddressRange unused_range; + if (frame_sc.function->GetRangeContainingLoadAddress(step_addr, *target, + unused_range)) step_over_until_addrs.push_back(step_addr); else all_in_function = false; diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py new file mode 100644 index 0000000000000..de3892ed278f8 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -0,0 +1,133 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestStepUntilAPI(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + super().setUp() + + self.main_source = "main.c" + self.main_spec = lldb.SBFileSpec(self.main_source) + self.less_than_two = line_number("main.c", "Less than 2") + self.greater_than_two = line_number("main.c", "Greater than or equal to 2.") + self.back_out_in_main = line_number("main.c", "Back out in main") + self.in_foo = line_number("main.c", "In foo") + + def _build_dict_for_discontinuity(self): + return dict( + CFLAGS_EXTRAS="-funique-basic-block-section-names " + + "-ffunction-sections -fbasic-block-sections=list=" + + self.getSourcePath("function.list"), + LD_EXTRAS="-Wl,--script=" + self.getSourcePath("symbol.order"), + ) + + def _do_until(self, build_dict, args, until_line, expected_line): + self.build(dictionary=build_dict) + launch_info = lldb.SBLaunchInfo(args) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec, launch_info + ) + + self.assertSuccess( + thread.StepOverUntil(self.frame(), self.main_spec, until_line) + ) + + self.runCmd("process status") + + line = self.frame().GetLineEntry().GetLine() + self.assertEqual( + line, expected_line, "Did not get the expected stop line number" + ) + + def _assertDiscontinuity(self): + target = self.target() + foo = target.FindFunctions("foo") + self.assertEqual(len(foo), 1) + foo = foo[0] + + call_me = self.target().FindFunctions("call_me") + self.assertEqual(len(call_me), 1) + call_me = call_me[0] + + foo_addr = foo.function.GetStartAddress().GetLoadAddress(target) + found_before = False + found_after = False + for range in call_me.function.GetRanges(): + addr = range.GetBaseAddress().GetLoadAddress(target) + if addr < foo_addr: + found_before = True + if addr > foo_addr: + found_after = True + + self.assertTrue( + found_before and found_after, + "'foo' is not between 'call_me'" + str(foo) + str(call_me), + ) + + def test_hitting(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it.""" + self._do_until(None, None, self.less_than_two, self.less_than_two) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_hitting_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it -- with + discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + None, + self.less_than_two, + self.less_than_two, + ) + self._assertDiscontinuity() + + def test_missing(self): + """Test 
SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site""" + self._do_until( + None, ["foo", "bar", "baz"], self.less_than_two, self.back_out_in_main + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_missing_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by + stepping out to call site -- with discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + ["foo", "bar", "baz"], + self.less_than_two, + self.back_out_in_main, + ) + self._assertDiscontinuity() + + def test_bad_line(self): + """Test that we get an error if attempting to step outside the current + function""" + self.build() + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + def test_bad_line_discontinuous(self): + """Test that we get an error if attempting to step outside the current + function -- and the function is discontinuous""" + self.build(dictionary=self._build_dict_for_discontinuity()) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + self._assertDiscontinuity() diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list new file mode 100644 index 0000000000000..5900fe8c35069 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -0,0 +1 @@ +!call_me diff --git a/lldb/test/API/functionalities/thread/step_until/main.c b/lldb/test/API/functionalities/thread/step_until/main.c index bb866079cf5f5..4c52308f030e9 100644 --- a/lldb/test/API/functionalities/thread/step_until/main.c +++ b/lldb/test/API/functionalities/thread/step_until/main.c @@ -4,6 +4,9 @@ * unrelated to the program, just to achieve consistent * debug line tables, across platforms, that are not * dependent on compiler optimzations. */ + +int foo(int x) { return x; /* In foo */ } + int call_me(int argc) { printf ("At the start, argc: %d.\n", argc); diff --git a/lldb/test/API/functionalities/thread/step_until/symbol.order b/lldb/test/API/functionalities/thread/step_until/symbol.order new file mode 100644 index 0000000000000..dcc9607a4188f --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/symbol.order @@ -0,0 +1,9 @@ +SECTIONS { + .text.ordered : { + *(.text.call_me) + *(.text.foo) + *(.text.call_me.call_me.__part.1) + *(.text.call_me.call_me.__part.2) + *(.text.call_me.call_me.__part.3) + } +} INSERT BEFORE .text; From a8649067723a84d1b9320523aa63f639f7bf5dfa Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 11:55:22 +0000 Subject: [PATCH 32/45] [X86] Fix logical operator warnings. NFC. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index de5bb08ae3a39..dba38f3e1a0bc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26438,7 +26438,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (CC) { case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 break; // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); @@ -26447,7 +26447,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 0 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 break; // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); From 7075eee6bd0d445aa3f58ace314f7d12756c3e38 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 17 Jan 2025 12:58:15 +0100 Subject: [PATCH 33/45] [clang][bytecode] Add InitLinkScope for toplevel Expr temporary (#123319) --- clang/lib/AST/ByteCode/Compiler.cpp | 1 + clang/test/AST/ByteCode/cxx20.cpp | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4bfb80589620c..6677119d09211 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4247,6 +4247,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { + InitLinkScope ILS(this, InitLink::Temp(*LocalOffset)); if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 268362ceff635..268226a7c143e 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -893,3 +893,18 @@ namespace VirtDtor { static_assert(test('C', 'B')); } + +namespace TemporaryInNTTP { + template struct B { /* ... */ }; + struct J1 { + J1 *self=this; + }; + /// FIXME: The bytecode interpreter emits a different diagnostic here. + /// The current interpreter creates a fake MaterializeTemporaryExpr (see EvaluateAsConstantExpr) + /// which is later used as the LValueBase of the created APValue. + B j1; // ref-error {{pointer to temporary object is not allowed in a template argument}} \ + // expected-error {{non-type template argument is not a constant expression}} \ + // expected-note {{pointer to temporary is not a constant expression}} \ + // expected-note {{created here}} + B<2> j2; /// Ok. +} From 61f94ebc9ef39a47f393a0dca58335e39d961b07 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 17 Jan 2025 13:01:25 +0100 Subject: [PATCH 34/45] [NFC][Offload] Structure/Readability of CMake cache (#123328) Preparing to add more config options and want to group them all from most-common to project / component specific. 
--- offload/cmake/caches/AMDGPUBot.cmake | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index d72b620ae3080..69bef91b2ce49 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -1,17 +1,19 @@ -# This file is meant for test builds on one basic AMDGPU buildbot only. +# This file is used across all AMDGPU-cmake builders # Install directory set to /tmp as this is a bot config set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") +# General settings +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload" CACHE STRING "") + set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") - -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") From 58fc8029e91bf56811444d4a37a8f517a43bdc11 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 17 Jan 2025 12:45:14 +0100 Subject: [PATCH 35/45] [lldb] Skip TestStepUntilAPI on !x86_64, !aarch64 The compiler does not support this feature on other architectures. --- .../API/functionalities/thread/step_until/TestStepUntilAPI.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py index de3892ed278f8..59e028acf014c 100644 --- a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -73,6 +73,7 @@ def test_hitting(self): self._do_until(None, None, self.less_than_two, self.less_than_two) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_hitting_discontinuous(self): """Test SBThread.StepOverUntil - targeting a line and hitting it -- with discontinuous functions""" @@ -91,6 +92,7 @@ def test_missing(self): ) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_missing_discontinuous(self): """Test SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site -- with discontinuous functions""" @@ -117,6 +119,7 @@ def test_bad_line(self): ) @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) def test_bad_line_discontinuous(self): """Test that we get an error if attempting to step outside the current function -- and the function is discontinuous""" From a90b5b1885cc9587d7d65edbe3e0d94c4e2f4459 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Fri, 17 Jan 2025 12:11:53 +0000 Subject: [PATCH 36/45] [libclc] Move degrees/radians to CLC library & optimize (#123222) Missing half variants were also added. 
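
For illustration, "vector form" here means each overload multiplies by the scalar
constant splatted to the operand's width. A minimal sketch of what the new
DEGREES_SINGLE_DEF macro in this patch expands to for the 4-wide float overload
(illustrative only; _CLC_OVERLOAD and _CLC_DEF are the existing clc attribute macros):

    // DEGREES_SINGLE_DEF(float4, 0x1.ca5dc2p+5F) expands to:
    _CLC_OVERLOAD _CLC_DEF float4 __clc_degrees(float4 radians) {
      // (float4)0x1.ca5dc2p+5F splats the scalar 180/pi constant to all lanes.
      return (float4)0x1.ca5dc2p+5F * radians;
    }

The radians overloads expand the same way with the pi/180 constant.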
The builtins are now consistently emitted in vector form (i.e., with a splat of the literal to the appropriate vector size). --- libclc/clc/include/clc/common/clc_degrees.h | 12 +++++ libclc/clc/include/clc/common/clc_radians.h | 12 +++++ libclc/clc/lib/generic/SOURCES | 2 + libclc/clc/lib/generic/common/clc_degrees.cl | 56 ++++++++++++++++++++ libclc/clc/lib/generic/common/clc_radians.cl | 56 ++++++++++++++++++++ libclc/clc/lib/spirv/SOURCES | 2 + libclc/clc/lib/spirv64/SOURCES | 2 + libclc/generic/lib/common/degrees.cl | 21 ++++---- libclc/generic/lib/common/radians.cl | 21 ++++---- 9 files changed, 160 insertions(+), 24 deletions(-) create mode 100644 libclc/clc/include/clc/common/clc_degrees.h create mode 100644 libclc/clc/include/clc/common/clc_radians.h create mode 100644 libclc/clc/lib/generic/common/clc_degrees.cl create mode 100644 libclc/clc/lib/generic/common/clc_radians.cl diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h new file mode 100644 index 0000000000000..e8bb684fcd4d7 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_DEGREES_H__ +#define __CLC_MATH_CLC_DEGREES_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_degrees + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h new file mode 100644 index 0000000000000..80d481e8de723 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_RADIANS_H__ +#define __CLC_MATH_CLC_RADIANS_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_radians + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_RADIANS_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index f3097de694422..d74bff20ba87b 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,3 +1,5 @@ +common/clc_degrees.cl +common/clc_radians.cl common/clc_smoothstep.cl geometric/clc_dot.cl integer/clc_abs.cl diff --git a/libclc/clc/lib/generic/common/clc_degrees.cl b/libclc/clc/lib/generic/common/clc_degrees.cl new file mode 100644 index 0000000000000..ce705982072e8 --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_degrees.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_degrees(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define DEGREES_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##2, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##3, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##4, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##8, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##16, LITERAL) + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(float, 0x1.ca5dc2p+5F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(double, 0x1.ca5dc1a63c1f8p+5) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(half, (half)0x1.ca5dc1a63c1f8p+5) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl new file mode 100644 index 0000000000000..850b8eb84f9da --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define RADIANS_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##16, LITERAL) + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(float, 0x1.1df46ap-6F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(double, 0x1.1df46a2529d39p-6) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) + +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl index cf49b190c76b3..a9715d64f507a 100644 --- a/libclc/generic/lib/common/degrees.cl +++ b/libclc/generic/lib/common/degrees.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc2p+5F * radians; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc1a63c1f8p+5 * radians; -} +_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); +_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) #endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl index 645a30549afed..b5dcbfe6e3fd2 100644 --- a/libclc/generic/lib/common/radians.cl +++ b/libclc/generic/lib/common/radians.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46ap-6F * degrees; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { - // pi/180 = 
~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46a2529d39p-6 * degrees; -} +_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); +_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) #endif From 8c63648117f1e1705943903b149f36ab8a4df1e5 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 12:14:20 +0000 Subject: [PATCH 37/45] =?UTF-8?q?Revert=20"Revert=20"[Flang][Driver]=20Add?= =?UTF-8?q?=20a=20flag=20to=20control=20zero=20initializa=E2=80=A6=20(#123?= =?UTF-8?q?097)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tion of global v…" (#123067)" This reverts commit 44ba43aa2b740878d83a9d6f1d52a333c0d48c22. Adds the flag to bbc as well. --- clang/include/clang/Driver/Options.td | 5 +++++ clang/lib/Driver/ToolChains/Flang.cpp | 6 +++-- flang/include/flang/Lower/LoweringOptions.def | 3 +++ flang/lib/Frontend/CompilerInvocation.cpp | 8 +++++++ flang/lib/Lower/ConvertVariable.cpp | 6 ++++- flang/test/Driver/fno-zero-init.f90 | 9 ++++++++ flang/test/Lower/zero_init.f90 | 20 +++++++++++++++++ flang/test/Lower/zero_init_default_init.f90 | 22 +++++++++++++++++++ flang/tools/bbc/bbc.cpp | 6 +++++ 9 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 flang/test/Driver/fno-zero-init.f90 create mode 100644 flang/test/Lower/zero_init.f90 create mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d38dd2b4e3cf0..c4b9743597bb2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,6 +3505,11 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; +defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", + PosFlag, + NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 86ed25badfa2b..9c1fd28a3a8a2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,8 +155,10 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_ftime_report, options::OPT_ftime_report_EQ, - options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); + options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_ftime_report, + options::OPT_ftime_report_EQ, options::OPT_funroll_loops, + options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 5a6debfdffe03..396c91948be36 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,5 +44,8 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the 
shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) +/// If true, initialize globals without initialization to zero. +/// On by default. +ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 15b1e1e0a2488..3c6da4687f65d 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1377,6 +1377,14 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } + // -f[no-]init-global-zero + if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, + clang::driver::options::OPT_fno_init_global_zero, + /*default=*/true)) + invoc.loweringOpts.setInitGlobalZero(true); + else + invoc.loweringOpts.setInitGlobalZero(false); + // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 9ee42d5cd8800..87236dc293ebb 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,7 +635,11 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue = builder.create(loc, symTy); + mlir::Value initValue; + if (converter.getLoweringOptions().getInitGlobalZero()) + initValue = builder.create(loc, symTy); + else + initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 new file mode 100644 index 0000000000000..2ffa10dd040d5 --- /dev/null +++ b/flang/test/Driver/fno-zero-init.f90 @@ -0,0 +1,9 @@ +! Check that the driver passes through -f[no-]init-global-zero: +! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s +! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s +! Check that the compiler accepts -f[no-]init-global-zero: +! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - +! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - + +! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" +! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 new file mode 100644 index 0000000000000..5ed6f2247de3b --- /dev/null +++ b/flang/test/Lower/zero_init.f90 @@ -0,0 +1,20 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s +! 
RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s + +module m1 + real :: x +end module m1 + +!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { +!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 +!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 +!CHECK-DEFAULT: } + +!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { +!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 +!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 +!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 new file mode 100644 index 0000000000000..e2d1f545e35a5 --- /dev/null +++ b/flang/test/Lower/zero_init_default_init.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck %s + +! Test that the flag does not affect globals with default init + +module m2 + type val + integer :: my_val = 1 + end type val + type(val) :: v1 +end module m2 + +!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { +!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 +!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> +!CHECK: } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 7efc460be8679..dafbcd856389a 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -234,6 +234,11 @@ static llvm::cl::opt integerWrapAround( llvm::cl::desc("Treat signed integer overflow as two's complement"), llvm::cl::init(false)); +static llvm::cl::opt initGlobalZero( + "finit-global-zero", + llvm::cl::desc("Zero initialize globals without default initialization"), + llvm::cl::init(true)); + static llvm::cl::opt reallocateLHS("frealloc-lhs", llvm::cl::desc("Follow Fortran 2003 rules for (re)allocating " @@ -381,6 +386,7 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); + loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); std::vector envDefaults = {}; Fortran::frontend::TargetOptions targetOpts; From bacfdcd7e0989117a3c76b040fe9efe093fa8708 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 17 Jan 2025 12:22:07 +0000 Subject: [PATCH 38/45] [DAG] Add SDPatternMatch::m_BitCast matcher (#123327) Simplifies a future patch --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 4 ++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp | 5 +++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 4faa090901a6a..4488a6152117c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -896,6 +896,10 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, 
return UnaryOpc_match(Opc, Op); } +template inline UnaryOpc_match m_BitCast(const Opnd &Op) { + return UnaryOpc_match(ISD::BITCAST, Op); +} + template inline UnaryOpc_match m_BSwap(const Opnd &Op) { return UnaryOpc_match(ISD::BSWAP, Op); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index de7fb21f5903e..49e5b7d9ef014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15770,7 +15770,7 @@ SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, // FIXME: I don't think looking for bitcast intrinsically makes sense, but // removing this would require more changes. auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { - if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + if (sd_match(Op, m_BitCast(m_SpecificVT(VT)))) return true; return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index bf9c597d8ac5e..736a36da97f57 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -392,6 +392,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Bcast = DAG->getNode(ISD::BITCAST, DL, FloatVT, Op0); SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); @@ -423,8 +424,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::i32)))); EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitReverse(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::f32)))); EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); From ce3280a64467b5211ced77169f3203c07934e06b Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 17 Jan 2025 12:25:37 +0000 Subject: [PATCH 39/45] Fix for buildbot errors on non-aarch64 targets. 
(#123322) Add missing REQUIRES: aarch64-registered-target --- llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 90bd98a9b0d38..4b6a19d3f05cf 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 + +; REQUIRES: aarch64-registered-target + ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" From 8a229f595a5c0ff354cdfa05cda974a9d56674df Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 17 Jan 2025 12:27:58 +0000 Subject: [PATCH 40/45] =?UTF-8?q?Revert=20"Revert=20"Revert=20"[Flang][Dri?= =?UTF-8?q?ver]=20Add=20a=20flag=20to=20control=20zero=20initializa?= =?UTF-8?q?=E2=80=A6"=20(#123330)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts llvm/llvm-project#123097 Reverting due to buildbot failure https://lab.llvm.org/buildbot/#/builders/89/builds/14577. --- clang/include/clang/Driver/Options.td | 5 ----- clang/lib/Driver/ToolChains/Flang.cpp | 6 ++--- flang/include/flang/Lower/LoweringOptions.def | 3 --- flang/lib/Frontend/CompilerInvocation.cpp | 8 ------- flang/lib/Lower/ConvertVariable.cpp | 6 +---- flang/test/Driver/fno-zero-init.f90 | 9 -------- flang/test/Lower/zero_init.f90 | 20 ----------------- flang/test/Lower/zero_init_default_init.f90 | 22 ------------------- flang/tools/bbc/bbc.cpp | 6 ----- 9 files changed, 3 insertions(+), 82 deletions(-) delete mode 100644 flang/test/Driver/fno-zero-init.f90 delete mode 100644 flang/test/Lower/zero_init.f90 delete mode 100644 flang/test/Lower/zero_init_default_init.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c4b9743597bb2..d38dd2b4e3cf0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3505,11 +3505,6 @@ def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, Visibility<[ClangOption, FlangOption]>; -defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", - PosFlag, - NegFlag>; def fno_pointer_tbaa : Flag<["-"], "fno-pointer-tbaa">, Group; def fno_temp_file : Flag<["-"], "fno-temp-file">, Group, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText< diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 9c1fd28a3a8a2..86ed25badfa2b 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -155,10 +155,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, - options::OPT_finit_global_zero, - options::OPT_fno_init_global_zero, options::OPT_ftime_report, - options::OPT_ftime_report_EQ, options::OPT_funroll_loops, - options::OPT_fno_unroll_loops}); + options::OPT_ftime_report, options::OPT_ftime_report_EQ, + 
options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 396c91948be36..5a6debfdffe03 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -44,8 +44,5 @@ ENUM_LOWERINGOPT(IntegerWrapAround, unsigned, 1, 0) /// If false, assume that the shapes/types/allocation-status match. ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) -/// If true, initialize globals without initialization to zero. -/// On by default. -ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) #undef LOWERINGOPT #undef ENUM_LOWERINGOPT diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 3c6da4687f65d..15b1e1e0a2488 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1377,14 +1377,6 @@ bool CompilerInvocation::createFromArgs( invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } - // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, - /*default=*/true)) - invoc.loweringOpts.setInitGlobalZero(true); - else - invoc.loweringOpts.setInitGlobalZero(false); - // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 87236dc293ebb..9ee42d5cd8800 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -635,11 +635,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter, global.setLinkName(builder.createCommonLinkage()); Fortran::lower::createGlobalInitialization( builder, global, [&](fir::FirOpBuilder &builder) { - mlir::Value initValue; - if (converter.getLoweringOptions().getInitGlobalZero()) - initValue = builder.create(loc, symTy); - else - initValue = builder.create(loc, symTy); + mlir::Value initValue = builder.create(loc, symTy); builder.create(loc, initValue); }); } diff --git a/flang/test/Driver/fno-zero-init.f90 b/flang/test/Driver/fno-zero-init.f90 deleted file mode 100644 index 2ffa10dd040d5..0000000000000 --- a/flang/test/Driver/fno-zero-init.f90 +++ /dev/null @@ -1,9 +0,0 @@ -! Check that the driver passes through -f[no-]init-global-zero: -! RUN: %flang -### -S -finit-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-POS %s -! RUN: %flang -### -S -fno-init-global-zero %s -o - 2>&1 | FileCheck --check-prefix=CHECK-NEG %s -! Check that the compiler accepts -f[no-]init-global-zero: -! RUN: %flang_fc1 -emit-hlfir -finit-global-zero %s -o - -! RUN: %flang_fc1 -emit-hlfir -fno-init-global-zero %s -o - - -! CHECK-POS: "-fc1"{{.*}}"-finit-global-zero" -! CHECK-NEG: "-fc1"{{.*}}"-fno-init-global-zero" diff --git a/flang/test/Lower/zero_init.f90 b/flang/test/Lower/zero_init.f90 deleted file mode 100644 index 5ed6f2247de3b..0000000000000 --- a/flang/test/Lower/zero_init.f90 +++ /dev/null @@ -1,20 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! 
RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s -! RUN: bbc -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s -! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-NO-ZERO-INIT %s - -module m1 - real :: x -end module m1 - -!CHECK-DEFAULT: fir.global @_QMm1Ex : f32 { -!CHECK-DEFAULT: %[[UNDEF:.*]] = fir.zero_bits f32 -!CHECK-DEFAULT: fir.has_value %[[UNDEF]] : f32 -!CHECK-DEFAULT: } - -!CHECK-NO-ZERO-INIT: fir.global @_QMm1Ex : f32 { -!CHECK-NO-ZERO-INIT: %[[UNDEF:.*]] = fir.undefined f32 -!CHECK-NO-ZERO-INIT: fir.has_value %[[UNDEF]] : f32 -!CHECK-NO-ZERO-INIT: } diff --git a/flang/test/Lower/zero_init_default_init.f90 b/flang/test/Lower/zero_init_default_init.f90 deleted file mode 100644 index e2d1f545e35a5..0000000000000 --- a/flang/test/Lower/zero_init_default_init.f90 +++ /dev/null @@ -1,22 +0,0 @@ -! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -finit-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: %flang_fc1 -fno-init-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -finit-global-zero -emit-hlfir -o - %s | FileCheck %s -! RUN: bbc -finit-global-zero=false -emit-hlfir -o - %s | FileCheck %s - -! Test that the flag does not affect globals with default init - -module m2 - type val - integer :: my_val = 1 - end type val - type(val) :: v1 -end module m2 - -!CHECK: fir.global @_QMm2Ev1 : !fir.type<_QMm2Tval{my_val:i32}> { -!CHECK: %[[V1:.*]] = fir.undefined !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: %[[ONE:.*]] = arith.constant 1 : i32 -!CHECK: %[[V1_INIT:.*]] = fir.insert_value %[[V1]], %[[ONE]], ["my_val", !fir.type<_QMm2Tval{my_val:i32}>] : (!fir.type<_QMm2Tval{my_val:i32}>, i32) -> !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: fir.has_value %[[V1_INIT]] : !fir.type<_QMm2Tval{my_val:i32}> -!CHECK: } diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index dafbcd856389a..7efc460be8679 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -234,11 +234,6 @@ static llvm::cl::opt integerWrapAround( llvm::cl::desc("Treat signed integer overflow as two's complement"), llvm::cl::init(false)); -static llvm::cl::opt initGlobalZero( - "finit-global-zero", - llvm::cl::desc("Zero initialize globals without default initialization"), - llvm::cl::init(true)); - static llvm::cl::opt reallocateLHS("frealloc-lhs", llvm::cl::desc("Follow Fortran 2003 rules for (re)allocating " @@ -386,7 +381,6 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); loweringOptions.setIntegerWrapAround(integerWrapAround); - loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); std::vector envDefaults = {}; Fortran::frontend::TargetOptions targetOpts; From 22637a877ae7fbfd5cf030400979fd4527eaebcf Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 17 Jan 2025 12:52:24 +0000 Subject: [PATCH 41/45] [Loads] Respect UseDerefAtPointSemantics in isDerefAndAlignedPointer. (#123196) If a pointer gets freed, it may not be dereferenceable any longer, even though there is a dominating dereferenceable assumption. As first step, only consider assumptions if the pointer value cannot be freed if UseDerefAtPointSemantics is used. 
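A rough C++ sketch of the hazard this guards against (illustrative only, not taken from the patch; the function names are made up, and the real change operates on "dereferenceable" assume operand bundles rather than source code):

    #include <cstdlib>

    // 'release' is a hypothetical callback that may call free(p).
    int sum(int *p, bool other, void (*release)(int *)) {
      int first = *p;        // p is known dereferenceable at this point.
      release(p);            // p may be freed by this call...
      if (other)
        return first + *p;   // ...so this load must stay guarded by the branch
      return first;          //    and cannot be speculated above the check.
    }

This mirrors the shape of the new vectorizer tests below: an assumption made before a call that may free the pointer no longer licenses an unconditional load afterwards, so the loads become predicated.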
PR: https://github.com/llvm/llvm-project/pull/123196 --- llvm/lib/Analysis/Loads.cpp | 4 +- llvm/lib/IR/Value.cpp | 2 +- ...able-info-from-assumption-constant-size.ll | 156 +++++++++++++++--- 3 files changed, 138 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bbd469bd035d..11ccfa33821ca 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseDerefAtPointSemantics; + static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { return Base->getPointerAlignment(DL) >= Alignment; @@ -168,7 +170,7 @@ static bool isDereferenceableAndAlignedPointer( Size, DL, CtxI, AC, DT, TLI, Visited, MaxDepth); - if (CtxI) { + if (CtxI && (!UseDerefAtPointSemantics || !V->canBeFreed())) { /// Look through assumes to see if both dereferencability and alignment can /// be proven by an assume if needed. RetainedKnowledge AlignRK; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 65b63955b6f6d..eddb67282fca4 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -static cl::opt UseDerefAtPointSemantics( +cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 572511a5ffb92..90671689f1dce 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics=1 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics -S %s | FileCheck %s declare void @llvm.assume(i1) -define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -104,7 +104,7 @@ exit: ret void } -define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -181,7 +181,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void 
@deref_assumption_too_small_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -282,7 +282,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -383,7 +383,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -484,7 +484,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -585,7 +585,7 @@ exit: ret void } -define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -682,7 +682,7 @@ exit: ret void } -define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -785,7 +785,7 @@ exit: ret void } -define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree { +define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -890,7 +890,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; 
CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -968,7 +968,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1063,7 +1063,7 @@ exit: ret void } -define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1142,7 +1142,7 @@ exit: } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1220,7 +1220,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1315,7 +1315,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1410,8 +1410,8 @@ exit: ret void } -; %a may be freeed between the dereferenceable assumption and accesses. -; FIXME: It is not safe to use with -use-dereferenceable-at-point-semantics. +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. 
define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { @@ -1422,16 +1422,29 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 @@ -1491,6 +1504,103 @@ exit: ret void } +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. 
+define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { +; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label 
%[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %a = call ptr @get_ptr() + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +declare ptr @get_ptr() declare void @may_free() ;. @@ -1528,4 +1638,6 @@ declare void @may_free() ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} ; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} ;. From fb2c9d940ad87e6ae09e06c6915e0c925a4f87ec Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 17 Jan 2025 21:03:53 +0800 Subject: [PATCH 42/45] [C++20] [Modules] Makes sure internal declaration won't be found by other TU (#123059) Close https://github.com/llvm/llvm-project/issues/61427 And this is also helpful to implement https://github.com/llvm/llvm-project/issues/112294 partially. The implementation strategy mimics https://github.com/llvm/llvm-project/pull/122887. This patch split the internal declarations from the general lookup table so that other TU can't find the internal declarations. 
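A minimal C++20 sketch of the language-level behaviour this serialization change enforces (illustrative only, not taken from the patch):

    // m.cppm
    export module m;
    static int internal_helper() { return 42; }     // internal linkage: TU-local
    export int api() { return internal_helper(); }  // exported, may use it here

    // user.cpp
    import m;
    int use() {
      return api();               // OK: exported name is visible
      // internal_helper();       // must not be found: internal to m.cppm
    }

With the split lookup table, internal_helper is emitted into the TU-local table of the module unit's AST file, so name lookup in an importing translation unit no longer sees it.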
--- .../include/clang/Serialization/ASTBitCodes.h | 6 + clang/include/clang/Serialization/ASTReader.h | 20 +- clang/include/clang/Serialization/ASTWriter.h | 11 +- clang/lib/Serialization/ASTReader.cpp | 89 +++++-- clang/lib/Serialization/ASTReaderDecl.cpp | 45 +++- clang/lib/Serialization/ASTWriter.cpp | 225 +++++++++++++----- clang/lib/Serialization/ASTWriterDecl.cpp | 12 +- .../basic.lookup.argdep/p5-ex2.cpp | 4 +- .../basic.scope/basic.scope.namespace/p2.cpp | 12 +- .../CXX/module/basic/basic.def.odr/p4.cppm | 5 - .../test/CXX/module/basic/basic.link/p2.cppm | 13 +- 11 files changed, 327 insertions(+), 115 deletions(-) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 40dae25f7b54b..d568d2fd7aa30 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -740,6 +740,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, UPDATE_MODULE_LOCAL_VISIBLE = 76, + + UPDATE_TU_LOCAL_VISIBLE = 77, }; /// Record types used within a source manager block. @@ -1340,6 +1342,10 @@ enum DeclCode { /// only visible from DeclContext in the same module. DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A record that stores the set of declarations that are only visible + /// to the TU. + DECL_CONTEXT_TU_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index c839215dc4077..82564fe664acb 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -528,6 +528,7 @@ class ASTReader uint64_t LexicalOffset; uint64_t VisibleOffset; uint64_t ModuleLocalOffset; + uint64_t TULocalOffset; }; using DelayedNamespaceOffsetMapTy = @@ -640,6 +641,9 @@ class ASTReader llvm::DenseMap ModuleLocalLookups; + llvm::DenseMap + TULocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; llvm::DenseMap PendingModuleLocalVisibleUpdates; + llvm::DenseMap TULocalUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -704,11 +709,17 @@ class ASTReader llvm::BitstreamCursor &Cursor, uint64_t Offset, DeclContext *DC); + enum class VisibleDeclContextStorageKind { + GenerallyVisible, + ModuleLocalVisible, + TULocalVisible, + }; + /// Read the record that describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal); + VisibleDeclContextStorageKind VisibleKind); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1148,6 +1159,10 @@ class ASTReader unsigned NumModuleLocalVisibleDeclContexts = 0, TotalModuleLocalVisibleDeclContexts = 0; + /// Number of TU Local decl contexts read/total + unsigned NumTULocalVisibleDeclContexts = 0, + TotalTULocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1481,6 +1496,9 @@ class ASTReader const serialization::reader::ModuleLocalLookupTable * getModuleLocalLookupTables(DeclContext *Primary) const; + const serialization::reader::DeclContextLookupTable * + getTULocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. 
serialization::reader::LazySpecializationInfoLookupTable * diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 53b09cc914392..079e39a9fb678 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumModuleLocalDeclContexts = 0; + /// The number of TULocal declcontexts written to the AST file. + unsigned NumTULocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. llvm::DenseMap SubmoduleIDs; @@ -594,12 +597,14 @@ class ASTWriter : public ASTDeserializationListener, void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable); + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset); + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -633,8 +638,10 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; unsigned DeclModuleLocalVisibleLookupAbbrev = 0; + unsigned DeclTULocalLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; unsigned ModuleLocalUpdateVisibleAbbrev = 0; + unsigned TULocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index d08dc6b1b4d93..a72ff766685bb 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1425,10 +1425,9 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, return false; } -bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, - BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal) { +bool ASTReader::ReadVisibleDeclContextStorage( + ModuleFile &M, BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, + ASTReader::VisibleDeclContextStorageKind VisibleKind) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1452,22 +1451,42 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { - Error("Expected visible lookup table block"); - return true; - } - if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { - Error("Expected module local visible lookup table block"); - return true; + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: + if (RecCode != DECL_CONTEXT_VISIBLE) { + Error("Expected visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: + if (RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (RecCode != DECL_CONTEXT_TU_LOCAL_VISIBLE) { + Error("Expected TU local lookup table block"); + return true; + } + break; 
} // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - if (!IsModuleLocal) + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); - else + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (M.Kind == MK_MainFile) + TULocalUpdates[ID].push_back(UpdateData{&M, Data}); + break; + } return false; } @@ -3613,6 +3632,21 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_TU_LOCAL_VISIBLE: { + if (F.Kind != MK_MainFile) + break; + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + TULocalUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3717,6 +3751,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; TotalModuleLocalVisibleDeclContexts += Record[4]; + TotalTULocalVisibleDeclContexts += Record[5]; break; case UNUSED_FILESCOPED_DECLS: @@ -4002,7 +4037,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 4 != 0) + if (Record.size() % 5 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -4021,9 +4056,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalModuleLocalOffset = Record[I++]; uint64_t ModuleLocalOffset = LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; + uint64_t TULocalLocalOffset = Record[I++]; + uint64_t TULocalOffset = + TULocalLocalOffset ? BaseOffset + TULocalLocalOffset : 0; DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, - ModuleLocalOffset}; + ModuleLocalOffset, TULocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8473,6 +8511,15 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } + if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) { + ++NumTULocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8500,6 +8547,7 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { findAll(Lookups, NumVisibleDeclContextsRead); findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); + findAll(TULocalLookups, NumTULocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8519,6 +8567,12 @@ ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { return I == ModuleLocalLookups.end() ? 
nullptr : &I->second; } +const serialization::reader::DeclContextLookupTable * +ASTReader::getTULocalLookupTables(DeclContext *Primary) const { + auto I = TULocalLookups.find(Primary); + return I == TULocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8634,6 +8688,11 @@ void ASTReader::PrintStats() { NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, ((float)NumModuleLocalVisibleDeclContexts / TotalModuleLocalVisibleDeclContexts * 100)); + if (TotalTULocalVisibleDeclContexts) + std::fprintf(stderr, " %u/%u visible declcontexts in GMF read (%f%%)\n", + NumTULocalVisibleDeclContexts, TotalTULocalVisibleDeclContexts, + ((float)NumTULocalVisibleDeclContexts / + TotalTULocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 06dff02ac6128..de834285fa76b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -414,7 +414,8 @@ class ASTDeclReader : public DeclVisitor { void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, - uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -1859,7 +1860,9 @@ void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2770,10 +2773,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, uint64_t &VisibleOffset, - uint64_t &ModuleLocalOffset) { + uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset) { LexicalOffset = ReadLocalOffset(); VisibleOffset = ReadLocalOffset(); ModuleLocalOffset = ReadLocalOffset(); + TULocalOffset = ReadLocalOffset(); } template @@ -3903,6 +3908,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: + case DECL_CONTEXT_TU_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4213,9 +4219,10 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; - Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, - ModuleLocalOffset); + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. 
@@ -4227,18 +4234,24 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { LexicalOffset = Iter->second.LexicalOffset; VisibleOffset = Iter->second.VisibleOffset; ModuleLocalOffset = Iter->second.ModuleLocalOffset; + TULocalOffset = Iter->second.TULocalOffset; } if (LexicalOffset && ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) return nullptr; - if (VisibleOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, - /*IsModuleLocal=*/false)) + if (VisibleOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, VisibleOffset, ID, + VisibleDeclContextStorageKind::GenerallyVisible)) return nullptr; if (ModuleLocalOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, - ID, /*IsModuleLocal=*/true)) + ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, ModuleLocalOffset, ID, + VisibleDeclContextStorageKind::ModuleLocalVisible)) + return nullptr; + if (TULocalOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, TULocalOffset, ID, + VisibleDeclContextStorageKind::TULocalVisible)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4404,6 +4417,18 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = TULocalUpdates.find(ID); I != TULocalUpdates.end()) { + auto Updates = std::move(I->second); + TULocalUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : Updates) + TULocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ASTDeclContextNameLookupTrait(*this, *Update.Mod)); + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 1c4f5730df312..c7c17e09a30e0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4047,6 +4047,13 @@ class ASTDeclContextNameLookupTraitBase { : Writer(Writer) {} public: + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4139,23 +4146,16 @@ class ASTDeclContextNameLookupTraitBase { } }; -class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +class ModuleLevelNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: using primary_module_hash_type = unsigned; using key_type = std::pair; using key_type_ref = key_type; - explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + explicit ModuleLevelNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} - data_type getData(const DeclIDsTy &LocalIDs) { - unsigned Start = DeclIDs.size(); - for (auto ID : LocalIDs) - DeclIDs.push_back(ID); - return std::make_pair(Start, DeclIDs.size()); - } - static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Key) { @@ -4203,19 +4203,46 @@ static bool isModuleLocalDecl(NamedDecl *D) { return false; } +static bool isTULocalInNamedModules(NamedDecl *D) { + Module *NamedModule = D->getTopLevelOwningNamedModule(); + if (!NamedModule) + return false; + + // For none-top level decls, we choose to move it to the general 
visible + // lookup table. Since the consumer may get its parent somehow and performs + // a lookup in it (considering looking up the operator function in lambda). + // The difference between module local lookup table and TU local lookup table + // is, the consumers still have a chance to lookup in the module local lookup + // table but **now** the consumers won't read the TU local lookup table if + // the consumer is not the original TU. + // + // FIXME: It seems to be an optimization chance (and also a more correct + // semantics) to remain the TULocal lookup table and performing similar lookup + // with the module local lookup table except that we only allow the lookups + // with the same module unit. + if (!D->getNonTransparentDeclContext()->isFileContext()) + return false; + + return D->getLinkageInternal() == Linkage::Internal; +} + // Trait used for the on-disk hash table used in the method pool. +template class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: - using ModuleLocalDeclsMapTy = - llvm::DenseMap; - -private: - ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + using ModuleLevelDeclsMapTy = + llvm::DenseMap; -public: using key_type = DeclarationNameKey; using key_type_ref = key_type; + using TULocalDeclsMapTy = llvm::DenseMap; + +private: + ModuleLevelDeclsMapTy ModuleLocalDeclsMap; + TULocalDeclsMapTy TULocalDeclsMap; + +public: explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} @@ -4251,15 +4278,30 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } } + if constexpr (CollectingTULocalDecls) { + if (isTULocalInNamedModules(D)) { + auto Iter = TULocalDeclsMap.find(D->getDeclName()); + if (Iter == TULocalDeclsMap.end()) + TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + DeclIDs.push_back(ID); } return std::make_pair(Start, DeclIDs.size()); } - const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + using ASTDeclContextNameLookupTraitBase::getData; + + const ModuleLevelDeclsMapTy &getModuleLocalDecls() { return ModuleLocalDeclsMap; } + const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; } + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } @@ -4487,7 +4529,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const DeclContext *ConstDC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable) { + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4497,9 +4540,11 @@ void ASTWriter::GenerateNameLookupTable( assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table"); // Create the on-disk hash table representation. 
- MultiOnDiskHashTableGenerator Generator; - ASTDeclContextNameLookupTrait Trait(*this); + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + Generator; + ASTDeclContextNameLookupTrait Trait(*this); // The first step is to collect the declaration names which we need to // serialize into the name lookup table, and to collect them in a stable @@ -4671,26 +4716,45 @@ void ASTWriter::GenerateNameLookupTable( Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); - if (ModuleLocalDecls.empty()) - return; + if (!ModuleLocalDecls.empty()) { + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLevelNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } - MultiOnDiskHashTableGenerator - ModuleLocalLookupGenerator; - ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); + } + + const auto &TULocalDecls = Trait.getTULocalDecls(); + if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) { + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + TULookupGenerator; + ASTDeclContextNameLookupTrait TULocalTrait( + *this); + + for (const auto &TULocalIter : TULocalDecls) { + const auto &Key = TULocalIter.first; + const auto &IDs = TULocalIter.second; + TULookupGenerator.insert(Key, TULocalTrait.getData(IDs), TULocalTrait); + } - for (const auto &ModuleLocalIter : ModuleLocalDecls) { - const auto &Key = ModuleLocalIter.first; - const auto &IDs = ModuleLocalIter.second; - ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), - ModuleLocalTrait); + auto *TULocalLookups = Chain ? Chain->getTULocalLookupTables(DC) : nullptr; + TULookupGenerator.emit(TULookupTable, TULocalTrait, + TULocalLookups ? &TULocalLookups->Table : nullptr); } - - auto *ModuleLocalLookups = - Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; - ModuleLocalLookupGenerator.emit( - ModuleLocalLookupTable, ModuleLocalTrait, - ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4701,7 +4765,12 @@ void ASTWriter::GenerateNameLookupTable( void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset) { + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset) { + assert(VisibleBlockOffset == 0); + assert(ModuleLocalBlockOffset == 0); + assert(TULocalBlockOffset == 0); + // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. @@ -4788,7 +4857,9 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, // Create the on-disk hash table in a buffer. 
SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; @@ -4796,17 +4867,26 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, LookupTable); ++NumVisibleDeclContexts; - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; + } - ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); - assert(ModuleLocalBlockOffset > VisibleBlockOffset); - // Write the lookup table - RecordData::value_type ModuleLocalRecord[] = { - DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; - Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, - ModuleLocalRecord, ModuleLocalLookupTable); - ++NumModuleLocalDeclContexts; + if (!TULookupTable.empty()) { + TULocalBlockOffset = Stream.GetCurrentBitNo(); + // Write the lookup table + RecordData::value_type TULocalDeclsRecord[] = { + DECL_CONTEXT_TU_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclTULocalLookupAbbrev, TULocalDeclsRecord, + TULookupTable); + ++NumTULocalDeclContexts; + } } /// Write an UPDATE_VISIBLE block for the given context. @@ -4824,7 +4904,9 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4836,14 +4918,20 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); + } - // Write the module local lookup table - RecordData::value_type ModuleLocalRecord[] = { - UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; - Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, - ModuleLocalLookupTable); + if (!TULookupTable.empty()) { + RecordData::value_type GMFRecord[] = { + UPDATE_TU_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(TULocalUpdateVisibleAbbrev, GMFRecord, + TULookupTable); + } } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. 
@@ -6031,9 +6119,12 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, } // Some simple statistics - RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, - NumModuleLocalDeclContexts}; + RecordData::value_type Record[] = {NumStatements, + NumMacros, + NumLexicalDeclContexts, + NumVisibleDeclContexts, + NumModuleLocalDeclContexts, + NumTULocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -6112,7 +6203,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -6124,10 +6217,14 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (ModuleLocalOffset) ModuleLocalOffset -= DeclTypesBlockStartOffset; + if (TULocalOffset) + TULocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); DelayedNamespaceRecord.push_back(ModuleLocalOffset); + DelayedNamespaceRecord.push_back(TULocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6213,6 +6310,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_TU_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + TULocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 7a494cfe1ac64..30b28057f4c10 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2069,6 +2069,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2080,12 +2081,14 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, - VisibleOffset, ModuleLocalOffset); + VisibleOffset, ModuleLocalOffset, + TULocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); Record.AddOffset(ModuleLocalOffset); + Record.AddOffset(TULocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2441,6 +2444,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2494,6 +2498,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2836,6 +2841,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_TU_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclTULocalLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp index a27946bd90a46..c200abafc0af8 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp @@ -61,6 +61,6 @@ void test() { // error: S::f is visible in instantiation context, but R::g has internal // linkage and cannot be used outside N.cpp - apply(x, S::Z()); // expected-error@N.cpp:10 {{no matching function for call to 'g'}} - // expected-note@-1 {{in instantiation of function template specialization 'apply' requested here}} + apply(x, S::Z()); // expected-error@N.cpp:10 {{use of undeclared identifier 'g'}} + // expected-note@-1 {{requested here}} } diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index 54ec6aa61ec37..d70eb7de22c6a 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ 
b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -66,11 +66,7 @@ void test_late() { // expected-note@p2.cpp:18 {{'exported' declared here}} #endif - internal = 1; -#ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'internal' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:20 {{declaration here is not visible}} -#endif + internal = 1; // expected-error {{use of undeclared identifier 'internal'}} not_exported_private = 1; #ifndef IMPLEMENTATION @@ -78,11 +74,7 @@ void test_late() { // expected-error@-3 {{undeclared identifier}} #endif - internal_private = 1; -#ifndef IMPLEMENTATION - // FIXME: should not be visible here - // expected-error@-3 {{undeclared identifier}} -#endif + internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}} } #endif diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 487dbdef283ee..7e88cbe78b4e3 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -128,7 +128,6 @@ void f(a::b, a::c) {} // // CHECK-DAG: @_ZW6Module25extern_var_module_linkage = external {{(dso_local )?}}global // CHECK-DAG: @_ZW6Module25inline_var_module_linkage = linkonce_odr {{(dso_local )?}}global -// CHECK-DAG: @_ZL25static_var_module_linkage = internal {{(dso_local )?}}global i32 0, // CHECK-DAG: @_ZW6Module24const_var_module_linkage = available_externally {{(dso_local )?}}constant i32 3, module Module; @@ -152,10 +151,6 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)&static_var_module_linkage; // FIXME: Should not be visible here. - (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 5a497304201dc..d7d2b5992a235 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -45,16 +45,14 @@ module M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} // expected-note@* {{}} (void)external_linkage_class{}; (void)module_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. 
- (void)internal_linkage_class{}; - (void)internal_linkage_var; + (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}} + (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}} } //--- user.cpp @@ -63,11 +61,10 @@ import M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} - internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} (void)external_linkage_class{}; - (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} (void)internal_linkage_var; // expected-error {{undeclared identifier}} From 41f430a48db992477534b65b288b47d487c4797d Mon Sep 17 00:00:00 2001 From: Wesley Wiser Date: Fri, 17 Jan 2025 07:09:00 -0600 Subject: [PATCH 43/45] [X86] Don't fold very large offsets into addr displacements during ISel (#121678) Doing so can cause the resulting displacement after frame layout to become inexpressible (or cause over/underflow currently during frame layout). Fixes the error reported in https://github.com/llvm/llvm-project/pull/101840#issuecomment-2306975944. --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 18 +++++---- llvm/test/CodeGen/X86/dag-large-offset.ll | 47 +++++++++++++++++++++++ llvm/test/CodeGen/X86/xor-lea.ll | 3 +- 3 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/X86/dag-large-offset.ll diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b340a778b36a..84bcdae520885 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1800,10 +1800,10 @@ void X86DAGToDAGISel::emitFunctionEntryCode() { emitSpecialCodeForMain(); } -static bool isDispSafeForFrameIndex(int64_t Val) { - // On 64-bit platforms, we can run into an issue where a frame index +static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { + // We can run into an issue where a frame index or a register base // includes a displacement that, when added to the explicit displacement, - // will overflow the displacement field. Assuming that the frame index + // will overflow the displacement field. Assuming that the // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. @@ -1831,7 +1831,7 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && - !isDispSafeForFrameIndex(Val)) + !isDispSafeForFrameIndexOrRegBase(Val)) return true; // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to // 64 bits. 
Instructions with 32-bit register addresses perform this zero @@ -1849,10 +1849,14 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // to get an address size override to be emitted. However, this // pseudo-register is not part of any register class and therefore causes // MIR verification to fail. - if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) && + if (Subtarget->isTarget64BitILP32() && + !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) && !AM.hasBaseOrIndexReg()) return true; - } + } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) + // For 32-bit X86, make sure the displacement still isn't close to the + // expressible limit. + return true; AM.Disp = Val; return false; } @@ -2553,7 +2557,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && - (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll new file mode 100644 index 0000000000000..2774a93993153 --- /dev/null +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i386 --frame-pointer=all | FileCheck %s + +; ISel will try to fold pointer arithmetic into the address displacement. However, we don't +; want to do that if the offset is very close to the expressible limit because the final frame +; layout may push it over/under the limit. 
+
+define i32 @foo(i1 %b) #0 {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %ebp, -8
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: .cfi_def_cfa_register %ebp
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: movl __stack_chk_guard, %eax
+; CHECK-NEXT: movl %eax, -4(%ebp)
+; CHECK-NEXT: testb $1, 8(%ebp)
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001
+; CHECK-NEXT: leal -5(%ebp,%eax), %eax
+; CHECK-NEXT: .LBB0_3: # %entry
+; CHECK-NEXT: movl __stack_chk_guard, %ecx
+; CHECK-NEXT: cmpl -4(%ebp), %ecx
+; CHECK-NEXT: jne .LBB0_5
+; CHECK-NEXT: # %bb.4: # %entry
+; CHECK-NEXT: addl $8, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-NEXT: retl
+; CHECK-NEXT: .LBB0_5: # %entry
+; CHECK-NEXT: .cfi_def_cfa %ebp, 8
+; CHECK-NEXT: calll __stack_chk_fail
+entry:
+  %a = alloca i8, align 1
+  %0 = ptrtoint ptr %a to i32
+  %sub = add i32 %0, -2147483647
+  %retval.0 = select i1 %b, i32 %sub, i32 0
+  ret i32 %retval.0
+}
+
+attributes #0 = { sspreq }
diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll
index 10e9525a2706a..d50752e48d293 100644
--- a/llvm/test/CodeGen/X86/xor-lea.ll
+++ b/llvm/test/CodeGen/X86/xor-lea.ll
@@ -327,7 +327,8 @@ define i32 @xor_shl_sminval_i32(i32 %x) {
 ; X86-LABEL: xor_shl_sminval_i32:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: leal -2147483648(,%eax,8), %eax
+; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT: leal (%ecx,%eax,8), %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: xor_shl_sminval_i32:

From 5153a90453e692b834e38eec247a0c88a0678bfa Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Fri, 17 Jan 2025 13:09:52 +0000
Subject: [PATCH 44/45] [lldb][DWARF] Change GetAttributes to always visit current DIE before recursing (#123261)

`GetAttributes` returns all attributes on a given DIE, including any
attributes that the DIE references via `DW_AT_abstract_origin` and
`DW_AT_specification`.

However, if an attribute exists on both the referring DIE and the
referenced DIE, the first one encountered will be the one that takes
precedence when querying the returned `DWARFAttributes`. But there was
no guarantee in which order those attributes get visited. That means
there's no convenient way of ensuring that an attribute of a definition
doesn't get shadowed by one found on the declaration. One use-case where
we don't want this to happen is for `DW_AT_object_pointer` (which can
exist on both definitions and declarations, see
https://github.com/llvm/llvm-project/pull/123089).

This patch makes sure we visit the current DIE's attributes before
following DIE references. I tried keeping as much of the original
`GetAttributes` unchanged and just added an outer `GetAttributes` that
keeps track of the DIEs we need to visit next.

There's precedent for this iteration order in
`llvm::DWARFDie::findRecursively` and also
`lldb_private::ElaboratingDIEIterator`. We could use the latter to
implement `GetAttributes`, though it also follows `DW_AT_signature` so I
decided to leave it for follow-up.
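To make the intended ordering concrete, here is a minimal standalone C++
sketch of the scheme described above. It is not the LLDB implementation;
the `Die` struct, the `getAttributes` helper, and the (name, value)
attribute representation are made-up stand-ins. The point is the
iteration order: append the current DIE's own attributes first, only then
queue the DIEs it references, and track already-seen DIEs so reference
cycles terminate.

  #include <cstdint>
  #include <set>
  #include <string>
  #include <utility>
  #include <vector>

  // Made-up stand-in for a DIE: a list of (attribute, value) pairs plus
  // the DIEs it refers to via DW_AT_specification/DW_AT_abstract_origin.
  struct Die {
    std::vector<std::pair<std::string, uint64_t>> Attrs;
    std::vector<const Die *> Refs;
  };

  // Earlier entries in the result take precedence, so visiting the
  // referring DIE before the DIEs it references lets a definition shadow
  // its declaration.
  std::vector<std::pair<std::string, uint64_t>> getAttributes(const Die &Start) {
    std::vector<std::pair<std::string, uint64_t>> Result;
    std::vector<const Die *> Worklist{&Start};
    std::set<const Die *> Seen{&Start}; // guards against reference cycles
    while (!Worklist.empty()) {
      const Die *Cur = Worklist.back();
      Worklist.pop_back();
      // Append the current DIE's own attributes first ...
      Result.insert(Result.end(), Cur->Attrs.begin(), Cur->Attrs.end());
      // ... and only then queue the DIEs it refers to.
      for (const Die *Ref : Cur->Refs)
        if (Seen.insert(Ref).second)
          Worklist.push_back(Ref);
    }
    return Result;
  }

  int main() {
    Die Decl{{{"DW_AT_name", 1}, {"DW_AT_low_pc", 0x1000}}, {}};
    Die Def{{{"DW_AT_low_pc", 0x2000}}, {&Decl}};
    // The first DW_AT_low_pc in the result is the definition's (0x2000),
    // not the declaration's, because the definition was visited first.
    auto Attrs = getAttributes(Def);
    return Attrs.front().second == 0x2000 ? 0 : 1;
  }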
--- .../SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 83 ++- .../SymbolFile/DWARF/DWARFDebugInfoEntry.h | 32 +- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 5 +- .../SymbolFile/DWARF/DWARFDIETest.cpp | 640 ++++++++++++++++++ 4 files changed, 727 insertions(+), 33 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 6d073411de876..c2edc52aa964f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -281,22 +281,34 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( return !ranges.empty(); } -// Get all attribute values for a given DIE, including following any -// specification or abstract origin attributes and including those in the -// results. Any duplicate attributes will have the first instance take -// precedence (this can happen for declaration attributes). -void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, - DWARFAttributes &attributes, - Recurse recurse, - uint32_t curr_depth) const { - const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); - if (!abbrevDecl) { - attributes.Clear(); - return; - } +/// Helper for the public \ref DWARFDebugInfoEntry::GetAttributes API. +/// Adds all attributes of the DIE at the top of the \c worklist to the +/// \c attributes list. Specifcations and abstract origins are added +/// to the \c worklist if the referenced DIE has not been seen before. +static bool GetAttributes(llvm::SmallVector &worklist, + llvm::SmallSet &seen, + DWARFAttributes &attributes) { + assert(!worklist.empty() && "Need at least one DIE to visit."); + assert(seen.size() >= 1 && + "Need to have seen at least the currently visited entry."); + + DWARFDIE current = worklist.pop_back_val(); + + const auto *cu = current.GetCU(); + assert(cu); + + const auto *entry = current.GetDIE(); + assert(entry); + + const auto *abbrevDecl = + entry->GetAbbreviationDeclarationPtr(current.GetCU()); + if (!abbrevDecl) + return false; const DWARFDataExtractor &data = cu->GetData(); - lldb::offset_t offset = GetFirstAttributeOffset(); + lldb::offset_t offset = current.GetDIE()->GetFirstAttributeOffset(); + + const bool is_first_die = seen.size() == 1; for (const auto &attribute : abbrevDecl->attributes()) { DWARFFormValue form_value(cu); @@ -309,10 +321,10 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, switch (attr) { case DW_AT_sibling: case DW_AT_declaration: - if (curr_depth > 0) { + if (!is_first_die) { // This attribute doesn't make sense when combined with the DIE that // references this DIE. We know a DIE is referencing this DIE because - // curr_depth is not zero + // we've visited more than one DIE already. 
break; } [[fallthrough]]; @@ -321,13 +333,12 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, break; } - if (recurse == Recurse::yes && - ((attr == DW_AT_specification) || (attr == DW_AT_abstract_origin))) { + if (attr == DW_AT_specification || attr == DW_AT_abstract_origin) { if (form_value.ExtractValue(data, &offset)) { - DWARFDIE spec_die = form_value.Reference(); - if (spec_die) - spec_die.GetDIE()->GetAttributes(spec_die.GetCU(), attributes, - recurse, curr_depth + 1); + if (DWARFDIE spec_die = form_value.Reference()) { + if (seen.insert(spec_die.GetDIE()).second) + worklist.push_back(spec_die); + } } } else { const dw_form_t form = form_value.Form(); @@ -339,6 +350,34 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, DWARFFormValue::SkipValue(form, data, &offset, cu); } } + + return true; +} + +DWARFAttributes DWARFDebugInfoEntry::GetAttributes(const DWARFUnit *cu, + Recurse recurse) const { + // FIXME: use ElaboratingDIEIterator to follow specifications/abstract origins + // instead of maintaining our own worklist/seen list. + + DWARFAttributes attributes; + + llvm::SmallVector worklist; + worklist.emplace_back(cu, this); + + // Keep track if DIEs already seen to prevent infinite recursion. + // Value of '3' was picked for the same reason that + // DWARFDie::findRecursively does. + llvm::SmallSet seen; + seen.insert(this); + + do { + if (!::GetAttributes(worklist, seen, attributes)) { + attributes.Clear(); + break; + } + } while (!worklist.empty() && recurse == Recurse::yes); + + return attributes; } // GetAttributeValue diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index de6bbf1d52789..72aeb2743b1e2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -52,12 +52,28 @@ class DWARFDebugInfoEntry { lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; - DWARFAttributes GetAttributes(DWARFUnit *cu, - Recurse recurse = Recurse::yes) const { - DWARFAttributes attrs; - GetAttributes(cu, attrs, recurse, 0 /* curr_depth */); - return attrs; - } + + /// Get all attribute values for a given DIE, optionally following any + /// specifications and abstract origins and including their attributes + /// in the result too. + /// + /// When following specifications/abstract origins, the attributes + /// on the referring DIE are guaranteed to be visited before the attributes of + /// the referenced DIE. + /// + /// \param[in] cu DWARFUnit that this entry belongs to. + /// + /// \param[in] recurse If set to \c Recurse::yes, will include attributes + /// on DIEs referenced via \c DW_AT_specification and \c DW_AT_abstract_origin + /// (including across multiple levels of indirection). + /// + /// \returns DWARFAttributes that include all attributes found on this DIE + /// (and possibly referenced DIEs). Attributes may appear multiple times + /// (e.g., if a declaration and definition both specify the same attribute). + /// On failure, the returned DWARFAttributes will be empty. 
+ /// + DWARFAttributes GetAttributes(const DWARFUnit *cu, + Recurse recurse = Recurse::yes) const; dw_offset_t GetAttributeValue(const DWARFUnit *cu, const dw_attr_t attr, DWARFFormValue &formValue, @@ -178,10 +194,6 @@ class DWARFDebugInfoEntry { /// A copy of the DW_TAG value so we don't have to go through the compile /// unit abbrev table dw_tag_t m_tag = llvm::dwarf::DW_TAG_null; - -private: - void GetAttributes(DWARFUnit *cu, DWARFAttributes &attrs, Recurse recurse, - uint32_t curr_depth) const; }; } // namespace dwarf } // namespace lldb_private::plugin diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 2f451d173c4dd..ad5005b660c64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3414,7 +3414,10 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, mangled = form_value.AsCString(); break; case DW_AT_type: - type_die_form = form_value; + // DW_AT_type on declaration may be less accurate than + // that of definition, so don't overwrite it. + if (!type_die_form.IsValid()) + type_die_form = form_value; break; case DW_AT_external: is_external = form_value.Boolean(); diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..3f61d1607073c 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -394,3 +394,643 @@ TEST(DWARFDIETest, GetContextInFunction) { EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +struct GetAttributesTestFixture : public testing::TestWithParam {}; + +TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) { + // Tests that we accumulate all current DIE's attributes first + // before checking the attributes of the specification. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeef + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + - Value: 0xdeadbeef + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x3 + Values: + - Value: 0xf00dcafe + - Value: 0xf + - Value: 0xf00dcafe + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(definition.GetAttributeValueAsOptionalUnsigned(DW_AT_external)); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + + // Check that the attributes on the definition (that are also present + // on the declaration) take precedence. + for (auto attr : {DW_AT_low_pc, DW_AT_high_pc}) { + auto idx = attrs.FindAttributeIndex(attr); + EXPECT_NE(idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(idx, form_value); + EXPECT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xf00dcafe); + } +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_Cycle) { + // Tests that GetAttributes can deal with cycles in + // specifications/abstract origins. 
+ // + // Contrived example: + // + // func1 -> func3 + // ^ | + // | v + // +------func2 + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + + - AbbrCode: 0x2 + Values: + - Value: 0x19 + + - AbbrCode: 0x2 + Values: + - Value: 0xf + + - AbbrCode: 0x2 + Values: + - Value: 0x14 + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto func1 = cu_die.GetFirstChild(); + ASSERT_TRUE(func1.IsValid()); + ASSERT_EQ(func1.Tag(), DW_TAG_subprogram); + + auto func2 = func1.GetSibling(); + ASSERT_TRUE(func2.IsValid()); + ASSERT_EQ(func2.Tag(), DW_TAG_subprogram); + + auto func3 = func2.GetSibling(); + ASSERT_TRUE(func3.IsValid()); + ASSERT_EQ(func3.Tag(), DW_TAG_subprogram); + + auto attrs = func1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 3U); + + // Confirm that the specifications do form a cycle. + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(0, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func3); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(1, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func2); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(2, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func1); + } +} + +TEST_P(GetAttributesTestFixture, + TestGetAttributes_SkipNonApplicableAttributes) { + // Tests that GetAttributes will omit attributes found through + // specifications/abstract origins which are not applicable. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_sibling + - AbbrCode: 0x2 + Values: + - Value: 0x1 + - Value: 0x0 + - Value: 0x18 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_sibling + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0xf + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 4U); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + + auto sibling_idx = attrs.FindAttributeIndex(DW_AT_sibling); + EXPECT_NE(sibling_idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(sibling_idx, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xdeadbeef); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_NoRecurse) { + // Tests that GetAttributes will not recurse if Recurse::No is passed to it. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_low_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::no); + EXPECT_EQ(attrs.Size(), 2U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_low_pc), UINT32_MAX); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_InvalidSpec) { + // Test that GetAttributes doesn't try following invalid + // specifications (but still add it to the list of attributes). + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 1U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); +} + +TEST(DWARFDIETest, TestGetAttributes_Worklist) { + // Test that GetAttributes will follow both the abstract origin + // and specification on a single DIE correctly (omitting non-applicable + // attributes in the process). + + // Contrived example where + // f1---> f2 --> f4 + // `-> f3 `-> f5 + // + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram ("f1") +# DW_AT_specification [DW_FORM_ref4] ("f2") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f3") + - AbbrCode: 0x2 + Values: + - Value: 0x18 + - Value: 0x21 + +# DW_TAG_subprogram ("f2") +# DW_AT_specification [DW_FORM_ref4] ("f4") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f5") + - AbbrCode: 0x2 + Values: + - Value: 0x22 + - Value: 0x23 + +# DW_TAG_subprogram ("f3") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f4") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f5") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto f1 = cu_die.GetFirstChild(); + ASSERT_TRUE(f1.IsValid()); + ASSERT_EQ(f1.Tag(), DW_TAG_subprogram); + + auto attrs = f1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_declaration), UINT32_MAX); +} + +INSTANTIATE_TEST_SUITE_P(GetAttributeTests, GetAttributesTestFixture, + testing::Values(DW_AT_specification, + DW_AT_abstract_origin)); From eff6b642583ace53aaed7947b92a43bcba283866 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 17 Jan 2025 13:19:11 +0000 Subject: [PATCH 45/45] [AArch64][GlobalISel] Update and regenerate some vecreduce and other tests. NFC --- .../GlobalISel/legalize-reduce-add.mir | 112 +- llvm/test/CodeGen/AArch64/aarch64-addv.ll | 230 +-- llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll | 178 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 1751 +++++++++-------- 4 files changed, 1129 insertions(+), 1142 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir index 253e6ebe793ce..76fdfd0c301f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir @@ -6,15 +6,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v16s8 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>)) %2:_(s8) = G_VECREDUCE_ADD %1(<16 x s8>) @@ -29,15 +29,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v8s16 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load (<8 x s16>)) %2:_(s16) = 
G_VECREDUCE_ADD %1(<8 x s16>) @@ -52,14 +52,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v4s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<4 x s32>) @@ -73,14 +73,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s64 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(p0) = COPY $x0 %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load (<2 x s64>)) %2:_(s64) = G_VECREDUCE_ADD %1(<2 x s64>) @@ -94,14 +94,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<2 x s32>) @@ -111,24 +111,25 @@ body: | ... --- name: test_v8i64 +# This is a power-of-2 legalization, so use a tree reduction. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a power-of-2 legalization, so use a tree reduction. 
; CHECK-LABEL: name: test_v8i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 - ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 @@ -143,25 +144,26 @@ body: | ... --- name: test_v6i64 +# This is a non-power-of-2 legalization, generate multiple vector reductions +# and combine them with scalar ops. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a non-power-of-2 legalization, generate multiple vector reductions - ; and combine them with scalar ops. ; CHECK-LABEL: name: test_v6i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] - ; CHECK: $x0 = COPY [[ADD1]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] + ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index def4192b0e005..aba284b4e0d29 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Function Attrs: nounwind readnone declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>) declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) @@ -23,14 +22,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>) declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>) -; GISEL: warning: Instruction selection used fallback path for addv_v2i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128 +; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128 define i8 @add_B(ptr %arr) { @@ -83,34 +82,34 @@ define i64 @add_D(ptr %arr) { define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) { -; SDAG-LABEL: oversized_ADDV_256: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: ldr d0, [x0] -; SDAG-NEXT: ldr d1, [x1] -; SDAG-NEXT: uabdl v0.8h, v0.8b, v1.8b -; SDAG-NEXT: uaddlv s0, v0.8h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_256: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_256: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d1, [x0] -; GISEL-NEXT: ldr d2, [x1] -; GISEL-NEXT: movi v0.2d, #0000000000000000 -; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b -; GISEL-NEXT: sshll v2.4s, v1.4h, #0 -; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 -; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h -; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v5.4s, v3.4s, #0 -; GISEL-NEXT: neg v6.4s, v2.4s -; GISEL-NEXT: mov v1.16b, v4.16b -; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b -; GISEL-NEXT: add v0.4s, v1.4s, v0.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: 
ret +; CHECK-GI-LABEL: oversized_ADDV_256: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: cmlt v4.4s, v2.4s, #0 +; CHECK-GI-NEXT: cmlt v5.4s, v3.4s, #0 +; CHECK-GI-NEXT: neg v6.4s, v2.4s +; CHECK-GI-NEXT: mov v1.16b, v4.16b +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: bsl v1.16b, v6.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %0 = load <8 x i8>, ptr %arg1, align 1 %1 = zext <8 x i8> %0 to <8 x i32> @@ -127,48 +126,48 @@ entry: declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q2, q3, [x0] -; SDAG-NEXT: add v1.4s, v3.4s, v1.4s -; SDAG-NEXT: add v0.4s, v2.4s, v0.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_512: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q0, q1, [x0, #32] +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_512: -; GISEL: // %bb.0: -; GISEL-NEXT: ldp q0, q1, [x0] -; GISEL-NEXT: ldp q2, q3, [x0, #32] -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: add v1.4s, v2.4s, v3.4s -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_512: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q3, [x0, #32] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %bin.rdx = load <16 x i32>, ptr %arr %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { -; SDAG-LABEL: addv_combine_i8: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.8b, v0.8b, v1.8b -; SDAG-NEXT: addv b0, v0.8b -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i8: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv b0, v0.8b -; GISEL-NEXT: addv b1, v1.8b -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxtb -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: addv b1, v1.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxtb +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2) @@ -177,21 +176,21 @@ entry: } define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { -; SDAG-LABEL: addv_combine_i16: -; SDAG: // 
%bb.0: // %entry -; SDAG-NEXT: add v0.4h, v0.4h, v1.4h -; SDAG-NEXT: addv h0, v0.4h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: addv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv h0, v0.4h -; GISEL-NEXT: addv h1, v1.4h -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxth -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: addv h1, v1.4h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxth +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2) @@ -200,21 +199,21 @@ entry: } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1) %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2) @@ -223,21 +222,21 @@ entry: } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i64: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: add x0, x8, x9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: addp d1, v1.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: add x0, x8, x9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1) %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2) @@ -471,3 +470,6 @@ entry: ret i128 %arg1 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll index b498611242d46..d69d1b6eb4a2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel -global-isel-abort=1 -pass-remarks-missed=gisel* -mtriple=arm64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI %0 = type { i64, i64 } @@ -39,22 +39,21 @@ declare i32 @llvm.aarch64.stxp(i64, i64, ptr) nounwind @var = dso_local global i64 0, align 8 -; FALLBACK-NOT: remark:{{.*}}test_load_i8 define dso_local void @test_load_i8(ptr %addr) { -; SDAG-LABEL: test_load_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -63,22 +62,21 @@ define dso_local void @test_load_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i16 define dso_local void @test_load_i16(ptr %addr) { -; SDAG-LABEL: test_load_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -87,22 +85,21 @@ define dso_local void @test_load_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i32 define dso_local void @test_load_i32(ptr %addr) { -; SDAG-LABEL: test_load_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: 
test_load_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -111,7 +108,6 @@ define dso_local void @test_load_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i64 define dso_local void @test_load_i64(ptr %addr) { ; CHECK-LABEL: test_load_i64: ; CHECK: // %bb.0: @@ -128,7 +124,6 @@ define dso_local void @test_load_i64(ptr %addr) { declare i64 @llvm.aarch64.ldxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_i8 define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_i8: ; CHECK: // %bb.0: @@ -140,7 +135,6 @@ define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i16 define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_i16: ; CHECK: // %bb.0: @@ -152,7 +146,6 @@ define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i32 define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_i32: ; CHECK: // %bb.0: @@ -163,7 +156,6 @@ define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i64 define dso_local i32 @test_store_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: // %bb.0: @@ -219,22 +211,21 @@ entry: declare %0 @llvm.aarch64.ldaxp(ptr) nounwind declare i32 @llvm.aarch64.stlxp(i64, i64, ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i8 define dso_local void @test_load_acquire_i8(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -243,22 +234,21 @@ define dso_local void @test_load_acquire_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i16 define dso_local void @test_load_acquire_i16(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrh w9, 
[x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -267,22 +257,21 @@ define dso_local void @test_load_acquire_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i32 define dso_local void @test_load_acquire_i32(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -291,7 +280,6 @@ define dso_local void @test_load_acquire_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i64 define dso_local void @test_load_acquire_i64(ptr %addr) { ; CHECK-LABEL: test_load_acquire_i64: ; CHECK: // %bb.0: @@ -308,7 +296,6 @@ define dso_local void @test_load_acquire_i64(ptr %addr) { declare i64 @llvm.aarch64.ldaxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_release_i8 define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i8: ; CHECK: // %bb.0: @@ -320,7 +307,6 @@ define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i16 define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i16: ; CHECK: // %bb.0: @@ -332,7 +318,6 @@ define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i32 define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i32: ; CHECK: // %bb.0: @@ -343,7 +328,6 @@ define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i64 define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: // %bb.0: @@ -378,5 +362,3 @@ define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind { } declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 7f3c1fdc93380..c9fe258f11556 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1,86 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Basic tests from input vector to bitmask ; IR generated from clang for: ; __builtin_convertvector + reinterpret_cast -; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 -; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat +; CHECK-GI: warning: Instruction selection used fallback path for convert_to_bitmask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; Bits used in mask -; SDAG-LABEL: convert_to_bitmask16: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI0_0@PAGE -; SDAG-NEXT: cmeq.16b v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask16: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-SD-NEXT: cmeq.16b v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask16: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.16b v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, 
w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask16: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret ; Actual conversion @@ -90,50 +91,50 @@ define i16 @convert_to_bitmask16(<16 x i8> 
%vec) { } define i16 @convert_to_bitmask8(<8 x i16> %vec) { -; SDAG-LABEL: convert_to_bitmask8: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI1_0@PAGE -; SDAG-NEXT: cmeq.8h v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-SD-NEXT: cmeq.8h v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer @@ -143,36 +144,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { } define i4 @convert_to_bitmask4(<4 x i32> %vec) { -; SDAG-LABEL: convert_to_bitmask4: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI2_0@PAGE -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask4: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI2_0@PAGE +; 
CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask4: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask4: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer @@ -220,37 +221,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) { define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_no_compare: -; SDAG: ; %bb.0: -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI5_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_no_compare: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_no_compare: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_no_compare: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, 
w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = and <4 x i32> %vec1, %vec2 @@ -260,39 +261,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { } define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_compare_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v2, v0, #0 -; SDAG-NEXT: cmeq.4s v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI6_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v0, v2 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v2, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v0, v2 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_compare_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v2, v0, #0 -; GISEL-NEXT: cmeq.4s v0, v0, v1 -; GISEL-NEXT: bic.16b v0, v0, v2 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v2, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v0, v0, v1 +; CHECK-GI-NEXT: bic.16b v0, v0, v2 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -303,39 +304,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec } define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI7_0@PAGE -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ldr q1, [x8, 
lCPI7_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: bic.16b v0, v1, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: bic.16b v0, v1, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -346,82 +347,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve } define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: adrp x8, lCPI8_0@PAGE -; SDAG-NEXT: movi d2, #0x000000ffffffff -; SDAG-NEXT: movi d3, #0x00ffffffffffff -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: movi d1, #0xffff0000ffff0000 -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: orr.8b v0, v0, v2 -; SDAG-NEXT: movi d2, #0x00ffffffff0000 -; SDAG-NEXT: eor.8b v1, v0, v1 -; SDAG-NEXT: eor.8b v0, v0, v2 -; SDAG-NEXT: mov.h v1[2], wzr -; SDAG-NEXT: orr.8b v0, v0, v3 -; SDAG-NEXT: orr.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] -; SDAG-NEXT: shl.4h v0, v0, #15 -; SDAG-NEXT: cmlt.4h v0, v0, #0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-SD-NEXT: movi d2, #0x000000ffffffff +; CHECK-SD-NEXT: movi d3, #0x00ffffffffffff +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: orr.8b v0, v0, v2 +; CHECK-SD-NEXT: movi d2, #0x00ffffffff0000 +; CHECK-SD-NEXT: eor.8b v1, v0, v1 +; CHECK-SD-NEXT: eor.8b v0, v0, v2 +; CHECK-SD-NEXT: mov.h v1[2], wzr +; CHECK-SD-NEXT: orr.8b v0, v0, v3 +; CHECK-SD-NEXT: orr.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4h v0, v0, #15 +; CHECK-SD-NEXT: cmlt.4h v0, v0, #0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: 
fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #1 ; =0x1 -; GISEL-NEXT: mov w9, #0 ; =0x0 -; GISEL-NEXT: cmeq.4s v5, v0, #0 -; GISEL-NEXT: fmov s2, w8 -; GISEL-NEXT: fmov s4, w9 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mov.16b v3, v2 -; GISEL-NEXT: mov.16b v0, v4 -; GISEL-NEXT: mov.h v4[1], w8 -; GISEL-NEXT: bic.16b v1, v1, v5 -; GISEL-NEXT: mov.16b v5, v2 -; GISEL-NEXT: mov.h v2[1], w8 -; GISEL-NEXT: mov.h v3[1], w8 -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.h v5[1], w8 -; GISEL-NEXT: mov.h v4[2], w8 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: mov.h v2[2], w8 -; GISEL-NEXT: mov.h v3[2], w9 -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v5[2], w9 -; GISEL-NEXT: mov.h v4[3], w9 -; GISEL-NEXT: mov.h v2[3], w9 -; GISEL-NEXT: mov.h v3[3], w9 -; GISEL-NEXT: mov.h v0[3], w8 -; GISEL-NEXT: mov.h v5[3], w8 -; GISEL-NEXT: orr.8b v1, v1, v3 -; GISEL-NEXT: eor.8b v0, v1, v0 -; GISEL-NEXT: eor.8b v1, v4, v1 -; GISEL-NEXT: and.8b v0, v0, v5 -; GISEL-NEXT: orr.8b v1, v2, v1 -; GISEL-NEXT: orr.8b v0, v0, v1 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #1 ; =0x1 +; CHECK-GI-NEXT: mov w9, #0 ; =0x0 +; CHECK-GI-NEXT: cmeq.4s v5, v0, #0 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: mov.16b v0, v4 +; CHECK-GI-NEXT: mov.h v4[1], w8 +; CHECK-GI-NEXT: bic.16b v1, v1, v5 +; CHECK-GI-NEXT: mov.16b v5, v2 +; CHECK-GI-NEXT: mov.h v2[1], w8 +; CHECK-GI-NEXT: mov.h v3[1], w8 +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.h v5[1], w8 +; CHECK-GI-NEXT: mov.h v4[2], w8 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: mov.h v2[2], w8 +; CHECK-GI-NEXT: mov.h v3[2], w9 +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v5[2], w9 +; CHECK-GI-NEXT: mov.h v4[3], w9 +; CHECK-GI-NEXT: mov.h v2[3], w9 +; CHECK-GI-NEXT: mov.h v3[3], w9 +; CHECK-GI-NEXT: mov.h v0[3], w8 +; CHECK-GI-NEXT: mov.h v5[3], w8 +; CHECK-GI-NEXT: orr.8b v1, v1, v3 +; CHECK-GI-NEXT: eor.8b v0, v1, v0 +; CHECK-GI-NEXT: eor.8b v1, v4, v1 +; CHECK-GI-NEXT: and.8b v0, v0, v5 +; CHECK-GI-NEXT: orr.8b v1, v2, v1 +; CHECK-GI-NEXT: orr.8b v0, v0, v1 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -440,42 +441,42 @@ define 
i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, < } define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI9_0@PAGE -; SDAG-NEXT: xtn.4h v1, v1 -; SDAG-NEXT: orn.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-SD-NEXT: xtn.4h v1, v1 +; CHECK-SD-NEXT: orn.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: cmeq.4h v0, v0, #0 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: orn.8b v0, v1, v0 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: cmeq.4h v0, v0, #0 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: orn.8b v0, v1, v0 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer @@ -486,73 +487,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 } define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) { -; SDAG-LABEL: convert_to_bitmask_without_knowing_type: -; SDAG: ; %bb.0: -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI10_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, 
v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_without_knowing_type: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; 
CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bitmask = bitcast <16 x i1> %vec to i16 ret i16 %bitmask @@ -575,51 +576,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) { } define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { -; SDAG-LABEL: convert_to_bitmask_4xi8: -; SDAG: ; %bb.0: -; SDAG-NEXT: bic.4h v0, #255, lsl #8 -; SDAG-NEXT: adrp x8, lCPI12_0@PAGE -; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: bic.8b v0, v1, v0 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_4xi8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-SD-NEXT: adrp x8, lCPI12_0@PAGE +; CHECK-SD-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: bic.8b v0, v1, v0 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_4xi8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #0 ; =0x0 -; GISEL-NEXT: uzp1.8b v0, v0, v0 -; GISEL-NEXT: fmov s1, w8 -; GISEL-NEXT: mov.b v1[1], w8 -; GISEL-NEXT: mov.b v1[2], w8 -; GISEL-NEXT: mov.b v1[3], w8 -; GISEL-NEXT: cmeq.8b v0, v0, v1 -; GISEL-NEXT: mvn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[0] -; GISEL-NEXT: umov.b w9, v0[1] -; GISEL-NEXT: mov.s v1[0], w8 -; GISEL-NEXT: umov.b w8, v0[2] -; GISEL-NEXT: mov.s v1[1], w9 -; GISEL-NEXT: umov.b w9, v0[3] -; GISEL-NEXT: mov.s v1[2], w8 -; GISEL-NEXT: mov.s v1[3], w9 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.s w9, v1[2] -; GISEL-NEXT: fmov w11, s1 -; GISEL-NEXT: mov.s w10, v1[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_4xi8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: uzp1.8b v0, v0, v0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 +; CHECK-GI-NEXT: mvn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[0] +; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: mov.s v1[0], w8 +; CHECK-GI-NEXT: umov.b w8, v0[2] +; CHECK-GI-NEXT: mov.s v1[1], w9 +; CHECK-GI-NEXT: umov.b w9, v0[3] +; CHECK-GI-NEXT: mov.s v1[2], w8 +; CHECK-GI-NEXT: mov.s v1[3], w9 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; 
CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -645,39 +646,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) { } define i4 @convert_to_bitmask_float(<4 x float> %vec) { -; SDAG-LABEL: convert_to_bitmask_float: -; SDAG: ; %bb.0: -; SDAG-NEXT: fcmgt.4s v1, v0, #0.0 -; SDAG-NEXT: fcmlt.4s v0, v0, #0.0 -; SDAG-NEXT: adrp x8, lCPI14_0@PAGE -; SDAG-NEXT: orr.16b v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_float: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-SD-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-SD-NEXT: adrp x8, lCPI14_0@PAGE +; CHECK-SD-NEXT: orr.16b v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_float: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: fcmgt.4s v1, v0, #0.0 -; GISEL-NEXT: fcmlt.4s v0, v0, #0.0 -; GISEL-NEXT: orr.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_float: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-GI-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-GI-NEXT: orr.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = fcmp one <4 x float> %vec, zeroinitializer @@ -688,58 +689,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) { ; Larger vector types don't map directly, but the can be split/truncated and then converted. ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again. 
define i8 @convert_large_vector(<8 x i32> %vec) { -; SDAG-LABEL: convert_large_vector: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI15_0@PAGE -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_large_vector: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_large_vector: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: uzp1.8h v0, v0, v1 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_large_vector: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = 
icmp ne <8 x i32> %vec, zeroinitializer @@ -748,40 +749,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) { } define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) { -; SDAG-LABEL: convert_legalized_illegal_element_size: -; SDAG: ; %bb.0: -; SDAG-NEXT: movi.4s v1, #63, msl #16 -; SDAG-NEXT: adrp x8, lCPI16_0@PAGE -; SDAG-NEXT: cmtst.4s v0, v0, v1 -; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_legalized_illegal_element_size: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.4s v1, #63, msl #16 +; CHECK-SD-NEXT: adrp x8, lCPI16_0@PAGE +; CHECK-SD-NEXT: cmtst.4s v0, v0, v1 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_legalized_illegal_element_size: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: movi.4s v1, #63, msl #16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_legalized_illegal_element_size: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -818,101 +819,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; SDAG-LABEL: no_convert_without_direct_bitcast: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmtst.8h v0, v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_convert_without_direct_bitcast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_convert_without_direct_bitcast: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_convert_without_direct_bitcast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result } define i6 
@no_combine_illegal_num_elements(<6 x i32> %vec) { -; SDAG-LABEL: no_combine_illegal_num_elements: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: fmov s0, w0 -; SDAG-NEXT: fmov s1, w4 -; SDAG-NEXT: mov.s v0[1], w1 -; SDAG-NEXT: mov.s v1[1], w5 -; SDAG-NEXT: mov.s v0[2], w2 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: mov.s v0[3], w3 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: mvn.16b v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: umov.b w8, v0[0] -; SDAG-NEXT: umov.b w9, v0[1] -; SDAG-NEXT: umov.b w10, v0[2] -; SDAG-NEXT: and w8, w8, #0x1 -; SDAG-NEXT: bfi w8, w9, #1, #1 -; SDAG-NEXT: umov.b w9, v0[3] -; SDAG-NEXT: bfi w8, w10, #2, #1 -; SDAG-NEXT: umov.b w10, v0[4] -; SDAG-NEXT: bfi w8, w9, #3, #1 -; SDAG-NEXT: umov.b w9, v0[5] -; SDAG-NEXT: bfi w8, w10, #4, #1 -; SDAG-NEXT: orr w8, w8, w9, lsl #5 -; SDAG-NEXT: and w0, w8, #0x3f -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_combine_illegal_num_elements: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w4 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v1[1], w5 +; CHECK-SD-NEXT: mov.s v0[2], w2 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: mov.s v0[3], w3 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: mvn.16b v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: umov.b w8, v0[0] +; CHECK-SD-NEXT: umov.b w9, v0[1] +; CHECK-SD-NEXT: umov.b w10, v0[2] +; CHECK-SD-NEXT: and w8, w8, #0x1 +; CHECK-SD-NEXT: bfi w8, w9, #1, #1 +; CHECK-SD-NEXT: umov.b w9, v0[3] +; CHECK-SD-NEXT: bfi w8, w10, #2, #1 +; CHECK-SD-NEXT: umov.b w10, v0[4] +; CHECK-SD-NEXT: bfi w8, w9, #3, #1 +; CHECK-SD-NEXT: umov.b w9, v0[5] +; CHECK-SD-NEXT: bfi w8, w10, #4, #1 +; CHECK-SD-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-SD-NEXT: and w0, w8, #0x3f +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_combine_illegal_num_elements: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov.s v0[0], w0 -; GISEL-NEXT: mov.s v1[0], w4 -; GISEL-NEXT: mov.s v2[0], wzr -; GISEL-NEXT: mov.s v0[1], w1 -; GISEL-NEXT: mov.s v1[1], w5 -; GISEL-NEXT: mov.s v2[1], wzr -; GISEL-NEXT: mov.s v0[2], w2 -; GISEL-NEXT: cmeq.4s v1, v1, v2 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: mov.s v0[3], w3 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v0[3], w10 -; GISEL-NEXT: mov.h v0[4], v1[0] -; GISEL-NEXT: mov.h v0[5], w8 -; GISEL-NEXT: umov.h w8, v0[1] -; GISEL-NEXT: umov.h w9, v0[0] -; GISEL-NEXT: umov.h w10, v0[2] -; GISEL-NEXT: umov.h w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.h w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.h w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w8, w8, #0x3f -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: 
no_combine_illegal_num_elements: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v1[0], w4 +; CHECK-GI-NEXT: mov.s v2[0], wzr +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v1[1], w5 +; CHECK-GI-NEXT: mov.s v2[1], wzr +; CHECK-GI-NEXT: mov.s v0[2], w2 +; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: mov.s v0[3], w3 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v0[3], w10 +; CHECK-GI-NEXT: mov.h v0[4], v1[0] +; CHECK-GI-NEXT: mov.h v0[5], w8 +; CHECK-GI-NEXT: umov.h w8, v0[1] +; CHECK-GI-NEXT: umov.h w9, v0[0] +; CHECK-GI-NEXT: umov.h w10, v0[2] +; CHECK-GI-NEXT: umov.h w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.h w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.h w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w8, w8, #0x3f +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer %bitmask = bitcast <6 x i1> %cmp_result to i6 @@ -921,220 +922,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; Only apply the combine when casting a vector to a scalar. 
define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { -; SDAG-LABEL: vector_to_vector_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI20_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; SDAG-NEXT: add x8, sp, #14 -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: str h0, [sp, #14] -; SDAG-NEXT: ld1.b { v0 }[0], [x8] -; SDAG-NEXT: orr x8, x8, #0x1 -; SDAG-NEXT: ld1.b { v0 }[4], [x8] -; SDAG-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: vector_to_vector_cast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-SD-NEXT: add x8, sp, #14 +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: str h0, [sp, #14] +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-SD-NEXT: orr x8, x8, #0x1 +; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: vector_to_vector_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: mov d1, v0[1] -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: umov.b w14, v0[2] -; GISEL-NEXT: umov.b w15, v0[3] -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: umov.b w17, v0[5] -; GISEL-NEXT: umov.b w12, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w0, v1[1] -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: bfi w13, w10, #1, #31 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w8, v1[0] -; GISEL-NEXT: umov.b w10, v1[2] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w13, w13, w14, lsl #2 -; GISEL-NEXT: umov.b w14, v1[3] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #2 -; GISEL-NEXT: orr w13, w13, w15, lsl #3 -; GISEL-NEXT: umov.b w15, v1[4] -; GISEL-NEXT: umov.b w11, v0[6] -; GISEL-NEXT: bfi w8, w0, #1, #31 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w17, #0x1 -; GISEL-NEXT: orr w13, w13, w16, lsl #4 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: orr w8, w8, w10, lsl #2 -; GISEL-NEXT: umov.b w10, v1[5] -; GISEL-NEXT: umov.b w16, v1[6] -; GISEL-NEXT: orr w13, w13, w17, lsl #5 -; GISEL-NEXT: umov.b w17, v0[4] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w8, w8, w14, lsl #3 -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v1[7] -; GISEL-NEXT: orr w9, w9, w12, lsl #3 -; GISEL-NEXT: orr w11, w13, w11, lsl #6 -; GISEL-NEXT: orr w8, w8, w15, lsl #4 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w12, w17, #0x1 -; GISEL-NEXT: umov.b w13, v0[1] -; GISEL-NEXT: orr w8, w8, w10, lsl #5 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w12, lsl #4 -; GISEL-NEXT: umov.b w10, v0[0] -; GISEL-NEXT: orr w11, w11, w0, lsl #7 -; GISEL-NEXT: and w14, w14, #0x1 -; 
GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w16, lsl #6 -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: umov.b w12, v0[6] -; GISEL-NEXT: strb w11, [sp, #8] -; GISEL-NEXT: and w11, w13, #0x1 -; GISEL-NEXT: umov.b w13, v0[3] -; GISEL-NEXT: orr w8, w8, w14, lsl #7 -; GISEL-NEXT: umov.b w14, v0[7] -; GISEL-NEXT: ldr b0, [sp, #8] -; GISEL-NEXT: bfi w10, w11, #1, #31 -; GISEL-NEXT: and w11, w15, #0x1 -; GISEL-NEXT: strb w8, [sp, #9] -; GISEL-NEXT: umov.b w15, v0[4] -; GISEL-NEXT: and w8, w12, #0x1 -; GISEL-NEXT: orr w10, w10, w11, lsl #2 -; GISEL-NEXT: orr w8, w9, w8, lsl #6 -; GISEL-NEXT: and w9, w13, #0x1 -; GISEL-NEXT: umov.b w11, v0[1] -; GISEL-NEXT: orr w9, w10, w9, lsl #3 -; GISEL-NEXT: umov.b w10, v0[5] -; GISEL-NEXT: umov.b w12, v0[0] -; GISEL-NEXT: and w13, w14, #0x1 -; GISEL-NEXT: umov.b w16, v0[2] -; GISEL-NEXT: umov.b w17, v0[3] -; GISEL-NEXT: and w14, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w13, lsl #7 -; GISEL-NEXT: orr w9, w9, w14, lsl #4 -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v0[3] -; GISEL-NEXT: strb w8, [sp, #10] -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: bfi w12, w11, #1, #31 -; GISEL-NEXT: orr w8, w9, w8, lsl #5 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: and w9, w15, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: orr w9, w12, w9, lsl #2 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w8, w8, w13, lsl #6 -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: orr w9, w9, w14, lsl #3 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: umov.b w0, v0[3] -; GISEL-NEXT: orr w9, w9, w10, lsl #4 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: umov.b w12, v0[7] -; GISEL-NEXT: orr w8, w8, w11, lsl #7 -; GISEL-NEXT: bfi w13, w15, #1, #31 -; GISEL-NEXT: and w11, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w10, lsl #5 -; GISEL-NEXT: and w10, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[4] -; GISEL-NEXT: strb w8, [sp, #11] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: umov.b w16, v0[3] -; GISEL-NEXT: orr w8, w9, w10, lsl #6 -; GISEL-NEXT: orr w9, w13, w11, lsl #2 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: and w11, w17, #0x1 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: umov.b w17, v0[0] -; GISEL-NEXT: orr w8, w8, w10, lsl #7 -; GISEL-NEXT: orr w9, w9, w11, lsl #3 -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: and w11, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[0] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #4 -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: bfi w17, w15, #1, #31 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w12, v0[2] -; GISEL-NEXT: bfi w14, w10, #1, #31 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: ldr b1, [sp, #9] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: strb w8, [sp, #12] -; GISEL-NEXT: orr w11, w14, w11, lsl #2 -; GISEL-NEXT: and w14, w16, #0x1 -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w13, lsl #6 -; GISEL-NEXT: orr w11, w11, w14, lsl #3 -; GISEL-NEXT: orr w12, w17, w12, lsl #2 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and 
w17, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[5] -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: orr w10, w11, w10, lsl #4 -; GISEL-NEXT: orr w12, w12, w17, lsl #3 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: umov.b w17, v0[6] -; GISEL-NEXT: orr w10, w10, w15, lsl #5 -; GISEL-NEXT: umov.b w15, v0[7] -; GISEL-NEXT: orr w12, w12, w16, lsl #4 -; GISEL-NEXT: and w16, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w12, w12, w16, lsl #5 -; GISEL-NEXT: orr w10, w10, w14, lsl #6 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w17, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #7 -; GISEL-NEXT: mov.s v0[1], v1[0] -; GISEL-NEXT: orr w11, w12, w13, lsl #6 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: orr w8, w10, w12, lsl #7 -; GISEL-NEXT: and w10, w0, #0x1 -; GISEL-NEXT: strb w9, [sp, #13] -; GISEL-NEXT: orr w9, w11, w10, lsl #7 -; GISEL-NEXT: strb w8, [sp, #14] -; GISEL-NEXT: strb w9, [sp, #15] -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: vector_to_vector_cast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: mov d1, v0[1] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] +; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, 
w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr 
w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 +; CHECK-GI-NEXT: umov.b w15, v0[7] +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-GI-NEXT: mov.s v0[1], v1[0] +; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #14] +; CHECK-GI-NEXT: strb w9, [sp, #15] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bc = bitcast <16 x i1> %arg to <2 x i8> ret <2 x i8> %bc }