From 72bfa28c07c810112da0778f504b91e87ab63600 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 24 Nov 2025 12:32:17 +0000 Subject: [PATCH 01/19] [X86] avx2-builtins.c - fix copy+paste typo in _mm256_cmpeq_epi8 constexpr test - still tested _mm_cmpeq_epi8 (#169311) --- clang/test/CodeGen/X86/avx2-builtins.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 13ad0545ab53f..6a884e98e9f3b 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -321,10 +321,10 @@ __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) { // CHECK: icmp eq <32 x i8> return _mm256_cmpeq_epi8(a, b); } -TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8( - (__m128i)(__v16qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16}, - (__m128i)(__v16qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}), - 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1)); +TEST_CONSTEXPR(match_v32qi(_mm256_cmpeq_epi8( + (__m256i)(__v32qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-16,15,-14,13,-12,11,-10,9,-8,7,-6,5,4,-3,2,-1}, + (__m256i)(__v32qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16,10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}), + 0, -1, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_cmpeq_epi16 From 74f5548bbc916a6c23731561f3808e64633760c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Mon, 24 Nov 2025 13:42:20 +0100 Subject: [PATCH 02/19] [HLSL][SPIR-V] Implements SV_Position for VS/PS I/O (#168735) Current implementation for SV_Position was very basic to allow implementing/testing some semantics. Now that semantic support is more robust, I can move forward and implement the whole semantic logic. DX part is still a bit placeholder. 
--- clang/include/clang/Sema/SemaHLSL.h | 11 +++-- clang/lib/CodeGen/CGHLSLRuntime.cpp | 41 ++++++++++++++----- clang/lib/Sema/SemaHLSL.cpp | 35 +++++++++------- .../HLSL/semantic-input-struct-shadow.hlsl | 21 ++++++++++ .../test/AST/HLSL/semantic-input-struct.hlsl | 20 +++++++++ clang/test/AST/HLSL/semantic-input.hlsl | 9 ++++ .../HLSL/semantic-output-struct-shadow.hlsl | 23 +++++++++++ .../test/AST/HLSL/semantic-output-struct.hlsl | 22 ++++++++++ clang/test/AST/HLSL/semantic-output.hlsl | 9 ++++ .../CodeGenHLSL/semantics/SV_Position.ps.hlsl | 20 ++++++--- .../CodeGenHLSL/semantics/SV_Position.vs.hlsl | 26 ++++++++++++ .../test/SemaHLSL/Semantics/position.ps.hlsl | 14 ++----- .../test/SemaHLSL/Semantics/position.vs.hlsl | 6 --- .../CodeGen/SPIRV/semantics/position.ps.ll | 32 +++++++++++++++ .../CodeGen/SPIRV/semantics/position.vs.ll | 31 ++++++++++++++ 15 files changed, 271 insertions(+), 49 deletions(-) create mode 100644 clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl create mode 100644 clang/test/AST/HLSL/semantic-input-struct.hlsl create mode 100644 clang/test/AST/HLSL/semantic-input.hlsl create mode 100644 clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl create mode 100644 clang/test/AST/HLSL/semantic-output-struct.hlsl create mode 100644 clang/test/AST/HLSL/semantic-output.hlsl create mode 100644 clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl delete mode 100644 clang/test/SemaHLSL/Semantics/position.vs.hlsl create mode 100644 llvm/test/CodeGen/SPIRV/semantics/position.ps.ll create mode 100644 llvm/test/CodeGen/SPIRV/semantics/position.vs.ll diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 86da323892f98..15edb7e77a22b 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -250,15 +250,20 @@ class SemaHLSL : public SemaBase { const RecordType *RT); void checkSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAppliedSemanticAttr 
*SemanticAttr); + const HLSLAppliedSemanticAttr *SemanticAttr, + bool IsInput); + bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, - llvm::StringSet<> &ActiveInputSemantics); + llvm::StringSet<> &ActiveSemantics, + bool IsInput); + bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, - llvm::StringSet<> &ActiveInputSemantics); + llvm::StringSet<> &ActiveSemantics, + bool IsInput); void processExplicitBindingsOnDecl(VarDecl *D); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 2a5f3f6895609..f5c07fe2e33ff 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -731,13 +731,22 @@ llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad( } if (SemanticName == "SV_POSITION") { - if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) - return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, - Semantic->getAttrName()->getName(), - /* BuiltIn::FragCoord */ 15); + if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) { + if (CGM.getTarget().getTriple().isSPIRV()) + return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, + Semantic->getAttrName()->getName(), + /* BuiltIn::FragCoord */ 15); + if (CGM.getTarget().getTriple().isDXIL()) + return emitDXILUserSemanticLoad(B, Type, Semantic, Index); + } + + if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Vertex) { + return emitUserSemanticLoad(B, Type, Decl, Semantic, Index); + } } - llvm_unreachable("non-handled system semantic. FIXME."); + llvm_unreachable( + "Load hasn't been implemented yet for this system semantic. 
FIXME"); } static void createSPIRVBuiltinStore(IRBuilder<> &B, llvm::Module &M, @@ -760,12 +769,22 @@ void CGHLSLRuntime::emitSystemSemanticStore(IRBuilder<> &B, llvm::Value *Source, std::optional Index) { std::string SemanticName = Semantic->getAttrName()->getName().upper(); - if (SemanticName == "SV_POSITION") - createSPIRVBuiltinStore(B, CGM.getModule(), Source, - Semantic->getAttrName()->getName(), - /* BuiltIn::Position */ 0); - else - llvm_unreachable("non-handled system semantic. FIXME."); + if (SemanticName == "SV_POSITION") { + if (CGM.getTarget().getTriple().isDXIL()) { + emitDXILUserSemanticStore(B, Source, Semantic, Index); + return; + } + + if (CGM.getTarget().getTriple().isSPIRV()) { + createSPIRVBuiltinStore(B, CGM.getModule(), Source, + Semantic->getAttrName()->getName(), + /* BuiltIn::Position */ 0); + return; + } + } + + llvm_unreachable( + "Store hasn't been implemented yet for this system semantic. FIXME"); } llvm::Value *CGHLSLRuntime::handleScalarSemanticLoad( diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0a164a7b5bbbd..ecab3946b58c7 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -771,9 +771,12 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) { } } -bool SemaHLSL::determineActiveSemanticOnScalar( - FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, - SemanticInfo &ActiveSemantic, llvm::StringSet<> &UsedSemantics) { +bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, + DeclaratorDecl *OutputDecl, + DeclaratorDecl *D, + SemanticInfo &ActiveSemantic, + llvm::StringSet<> &UsedSemantics, + bool IsInput) { if (ActiveSemantic.Semantic == nullptr) { ActiveSemantic.Semantic = D->getAttr(); if (ActiveSemantic.Semantic) @@ -792,7 +795,7 @@ bool SemaHLSL::determineActiveSemanticOnScalar( if (!A) return false; - checkSemanticAnnotation(FD, D, A); + checkSemanticAnnotation(FD, D, A, IsInput); OutputDecl->addAttr(A); unsigned Location = 
ActiveSemantic.Index.value_or(0); @@ -820,7 +823,8 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, - llvm::StringSet<> &UsedSemantics) { + llvm::StringSet<> &UsedSemantics, + bool IsInput) { if (ActiveSemantic.Semantic == nullptr) { ActiveSemantic.Semantic = D->getAttr(); if (ActiveSemantic.Semantic) @@ -833,12 +837,13 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, const RecordType *RT = dyn_cast(T); if (!RT) return determineActiveSemanticOnScalar(FD, OutputDecl, D, ActiveSemantic, - UsedSemantics); + UsedSemantics, IsInput); const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { SemanticInfo Info = ActiveSemantic; - if (!determineActiveSemantic(FD, OutputDecl, Field, Info, UsedSemantics)) { + if (!determineActiveSemantic(FD, OutputDecl, Field, Info, UsedSemantics, + IsInput)) { Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field; return false; } @@ -920,7 +925,7 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { // FIXME: Verify output semantics in parameters. 
if (!determineActiveSemantic(FD, Param, Param, ActiveSemantic, - ActiveInputSemantics)) { + ActiveInputSemantics, /* IsInput= */ true)) { Diag(Param->getLocation(), diag::note_previous_decl) << Param; FD->setInvalidDecl(); } @@ -932,12 +937,13 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { if (ActiveSemantic.Semantic) ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); if (!FD->getReturnType()->isVoidType()) - determineActiveSemantic(FD, FD, FD, ActiveSemantic, ActiveOutputSemantics); + determineActiveSemantic(FD, FD, FD, ActiveSemantic, ActiveOutputSemantics, + /* IsInput= */ false); } void SemaHLSL::checkSemanticAnnotation( FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAppliedSemanticAttr *SemanticAttr) { + const HLSLAppliedSemanticAttr *SemanticAttr, bool IsInput) { auto *ShaderAttr = EntryPoint->getAttr(); assert(ShaderAttr && "Entry point has no shader attribute"); llvm::Triple::EnvironmentType ST = ShaderAttr->getType(); @@ -961,11 +967,12 @@ void SemaHLSL::checkSemanticAnnotation( } if (SemanticName == "SV_POSITION") { - // TODO(#143523): allow use on other shader types & output once the overall - // semantic logic is implemented. - if (ST == llvm::Triple::Pixel) + // SV_Position can be an input or output in vertex shaders, + // but only an input in pixel shaders. 
+ if (ST == llvm::Triple::Vertex || (ST == llvm::Triple::Pixel && IsInput)) return; - DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, + {llvm::Triple::Pixel, llvm::Triple::Vertex}); return; } diff --git a/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl b/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl new file mode 100644 index 0000000000000..d4d89bd5d26ba --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'void (S)' +// CHECK-NEXT: ParmVarDecl {{.*}} s 'S' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "C" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "C" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "C" 1 +void main(S s : C) {} diff --git a/clang/test/AST/HLSL/semantic-input-struct.hlsl b/clang/test/AST/HLSL/semantic-input-struct.hlsl new file mode 100644 index 0000000000000..d71fdcff631f4 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input-struct.hlsl @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: 
FieldDecl {{.*}} field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'void (S)' +// CHECK-NEXT: ParmVarDecl {{.*}} s 'S' +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "A" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "B" 4 +void main(S s) {} diff --git a/clang/test/AST/HLSL/semantic-input.hlsl b/clang/test/AST/HLSL/semantic-input.hlsl new file mode 100644 index 0000000000000..4dc3ab9db7392 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + +// CHECK: ParmVarDecl {{.*}} a 'float4':'vector' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "ABC" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "ABC" 0 + +void main(float4 a : ABC) { +} diff --git a/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl b/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl new file mode 100644 index 0000000000000..e83901bb17943 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} referenced field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} referenced field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'S ()' +// CHECK: HLSLParsedSemanticAttr {{.*}} "DEF" 0 +// CHECK: HLSLAppliedSemanticAttr {{.*}} "DEF" 0 +// 
CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "DEF" 1 +S main() : DEF { + S tmp; + return tmp; +} diff --git a/clang/test/AST/HLSL/semantic-output-struct.hlsl b/clang/test/AST/HLSL/semantic-output-struct.hlsl new file mode 100644 index 0000000000000..727c0f3040641 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output-struct.hlsl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} referenced field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} referenced field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'S ()' +// CHECK: HLSLAppliedSemanticAttr {{.*}} "A" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "B" 4 +S main() { + S tmp; + return tmp; +} diff --git a/clang/test/AST/HLSL/semantic-output.hlsl b/clang/test/AST/HLSL/semantic-output.hlsl new file mode 100644 index 0000000000000..63429387f8d66 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + +// CHECK: FunctionDecl {{.*}} main 'uint ()' +// CHECK: HLSLParsedSemanticAttr {{.*}} "ABC" 0 +// CHECK: HLSLAppliedSemanticAttr {{.*}} "ABC" 0 +uint main() : ABC { + return 0; +} diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl index be30e79438831..b7d2283ea7766 100644 --- a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl +++ 
b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl @@ -1,11 +1,21 @@ -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple spirv-pc-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-DXIL -// CHECK: @SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +// CHECK-SPIRV: @SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_0:]] // CHECK: define void @main() {{.*}} { float4 main(float4 p : SV_Position) : A { - // CHECK: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 - // CHECK: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) - // CHECK: store <4 x float> %[[#R]], ptr addrspace(8) @A0, align 16 + // CHECK-SPIRV: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 + // CHECK-SPIRV: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) + // CHECK-SPIRV: store <4 x float> %[[#R]], ptr addrspace(8) @A0, align 16 + + // CHECK-DXIL: %SV_Position0 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison) + // CHECK-DXIL: %[[#TMP:]] = call <4 x float> @_Z4mainDv4_f(<4 x float> %SV_Position0) + // CHECK-DXIL: call void @llvm.dx.store.output.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison, <4 x float> %[[#TMP]]) return p; } + +// CHECK-SPIRV-DAG: ![[#MD_0]] = !{![[#MD_1:]]} +// CHECK-SPIRV-DAG: ![[#MD_1]] = !{i32 11, i32 15} +// | `-> BuiltIn Position +// `-> SPIR-V decoration 'FragCoord' diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl 
b/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl new file mode 100644 index 0000000000000..0156c0bb816c1 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.8-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck --check-prefix=CHECK-DXIL %s +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck --check-prefix=CHECK-SPIRV %s + +// CHECK-SPIRV: @SV_Position0 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_0:]] +// CHECK-SPIRV: @SV_Position = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations ![[#MD_2:]] + +// CHECK: define void @main() {{.*}} { +float4 main(float4 p : SV_Position) : SV_Position { + // CHECK-SPIRV: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position0, align 16 + // CHECK-SPIRV: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) + // CHECK-SPIRV: store <4 x float> %[[#R]], ptr addrspace(8) @SV_Position, align 16 + + // CHECK-DXIL: %SV_Position0 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison) + // CHECK-DXIL: %[[#TMP:]] = call <4 x float> @_Z4mainDv4_f(<4 x float> %SV_Position0) + // CHECK-DXIL: call void @llvm.dx.store.output.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison, <4 x float> %[[#TMP]]) + return p; +} + +// CHECK-SPIRV-DAG: ![[#MD_0]] = !{![[#MD_1:]]} +// CHECK-SPIRV-DAG: ![[#MD_2]] = !{![[#MD_3:]]} +// CHECK-SPIRV-DAG: ![[#MD_1]] = !{i32 30, i32 0} +// | `-> Location 0 +// `-> SPIR-V decoration 'Location' +// CHECK-SPIRV-DAG: ![[#MD_3]] = !{i32 11, i32 0} +// | `-> BuiltIn Position +// `-> SPIR-V decoration 'BuiltIn' diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl index 2d02384821d90..47d07887911d6 100644 
--- a/clang/test/SemaHLSL/Semantics/position.ps.hlsl +++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl @@ -1,13 +1,7 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -finclude-default-header -x hlsl -verify -o - %s +// RUN: %clang_cc1 -triple spirv-pc-vulkan1.3-pixel -finclude-default-header -x hlsl -verify -o - %s -// FIXME(Keenuts): change output semantic to something valid for pixels shaders -float4 main(float4 a : SV_Position2) : A { -// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 used a 'float4':'vector' -// CHECK-NEXT: HLSLParsedSemanticAttr 0x{{[0-9a-f]+}} "SV_Position" 2 -// CHECK-NEXT: HLSLAppliedSemanticAttr 0x{{[0-9a-f]+}} "SV_Position" 2 - -// CHECK: HLSLParsedSemanticAttr 0x{{[0-9a-f]+}} "A" 0 -// CHECK: HLSLAppliedSemanticAttr 0x{{[0-9a-f]+}} "A" 0 +float4 main(float4 a : A) : SV_Position { +// expected-error@-1 {{attribute 'SV_Position' is unsupported in 'pixel' shaders, requires one of the following: pixel, vertex}} return a; } diff --git a/clang/test/SemaHLSL/Semantics/position.vs.hlsl b/clang/test/SemaHLSL/Semantics/position.vs.hlsl deleted file mode 100644 index 9d0ff285ce055..0000000000000 --- a/clang/test/SemaHLSL/Semantics/position.vs.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-vertex -x hlsl -finclude-default-header -o - %s -verify - -// expected-error@+1 {{attribute 'SV_Position' is unsupported in 'vertex' shaders, requires pixel}} -float4 main(float4 a : SV_Position) : A { - return a; -} diff --git a/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll b/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll new file mode 100644 index 0000000000000..2c02987f73928 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll @@ -0,0 +1,32 @@ +; RUN: llc -O0 
-verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpDecorate %[[#INPUT:]] BuiltIn FragCoord +; CHECK-DAG: OpDecorate %[[#OUTPUT:]] Location 0 + +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#v4:]] = OpTypeVector %[[#float]] 4 +; CHECK-DAG: %[[#ptr_i:]] = OpTypePointer Input %[[#v4]] +; CHECK-DAG: %[[#ptr_o:]] = OpTypePointer Output %[[#v4]] + +; CHECK-DAG: %[[#INPUT]] = OpVariable %[[#ptr_i]] Input +; CHECK-DAG: %[[#OUTPUT]] = OpVariable %[[#ptr_o]] Output + +@SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +@A0 = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations !2 + +define void @main() #1 { +entry: + %0 = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 + store <4 x float> %0, ptr addrspace(8) @A0, align 16 + ret void + +; CHECK: %[[#TMP:]] = OpLoad %[[#v4]] %[[#INPUT]] Aligned 16 +; CHECK: OpStore %[[#OUTPUT]] %[[#TMP]] Aligned 16 +} + +!0 = !{!1} +!1 = !{i32 11, i32 15} +!2 = !{!3} +!3 = !{i32 30, i32 0} + diff --git a/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll b/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll new file mode 100644 index 0000000000000..73165f3719a97 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll @@ -0,0 +1,31 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpDecorate %[[#INPUT:]] Location 0 +; CHECK-DAG: OpDecorate %[[#OUTPUT:]] BuiltIn Position + +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#v4:]] = OpTypeVector %[[#float]] 4 +; CHECK-DAG: %[[#ptr_i:]] = OpTypePointer Input %[[#v4]] +; CHECK-DAG: %[[#ptr_o:]] = OpTypePointer Output %[[#v4]] + +; CHECK-DAG: %[[#INPUT]] = 
OpVariable %[[#ptr_i]] Input +; CHECK-DAG: %[[#OUTPUT]] = OpVariable %[[#ptr_o]] Output + +@SV_Position0 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +@SV_Position = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations !2 + +define void @main() #1 { +entry: + %0 = load <4 x float>, ptr addrspace(7) @SV_Position0, align 16 + store <4 x float> %0, ptr addrspace(8) @SV_Position, align 16 + ret void + +; CHECK: %[[#TMP:]] = OpLoad %[[#v4]] %[[#INPUT]] Aligned 16 +; CHECK: OpStore %[[#OUTPUT]] %[[#TMP]] Aligned 16 +} + +!0 = !{!1} +!1 = !{i32 30, i32 0} +!2 = !{!3} +!3 = !{i32 11, i32 0} From e4cff3c687fe909a2ff291576872aa06a55277ce Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 24 Nov 2025 12:42:36 +0000 Subject: [PATCH 03/19] [mlir] Avoid else after return in ScalableValueBounds (NFC) (#169211) --- .../Vector/IR/ScalableValueBoundsConstraintSet.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp index a26edac98ea8d..2986f4c2d607d 100644 --- a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp +++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp @@ -106,14 +106,12 @@ ScalableValueBoundsConstraintSet::computeScalableBound( AffineMap bound = [&] { if (boundType == BoundType::EQ && !invalidBound(lowerBound) && - lowerBound[0] == upperBound[0]) { + lowerBound[0] == upperBound[0]) return lowerBound[0]; - } - if (boundType == BoundType::LB && !invalidBound(lowerBound)) { + if (boundType == BoundType::LB && !invalidBound(lowerBound)) return lowerBound[0]; - } else if (boundType == BoundType::UB && !invalidBound(upperBound)) { + if (boundType == BoundType::UB && !invalidBound(upperBound)) return upperBound[0]; - } return AffineMap{}; }(); From 65fd9f1f891bcc4bc1a27a00a45a4c1d9670ae63 Mon 
Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 24 Nov 2025 12:49:25 +0000 Subject: [PATCH 04/19] [Attributor] Support nested conditional branches (#168532) The attributor can infer the alignment of %p at the call-site in this example [1]: ``` define void @f(ptr align 8 %p, i1 %c1, i1 %c2) { entry: br i1 %c1, label %bb.1, label %exit bb.1: call void (...) @llvm.fake.use(ptr %p) br label %exit exit: ret void } ``` but not when there's an additional conditional branch: ``` define void @f(ptr align 8 %p, i1 %c1, i1 %c2) { entry: br i1 %c1, label %bb.1, label %exit bb.1: br i1 %c2, label %bb.2, label %exit bb.2: call void (...) @llvm.fake.use(ptr %p) br label %exit exit: ret void } ``` unless `-attributor-annotate-decl-cs` is enabled. This patch extends `followUsesInMBEC` to handle such recursive branches. n.b. admittedly I wrote this patch before discovering inferring the alignment in this example is already possible with `-attributor-annotate-decl-cs`, I came to realise this once writing the tests, but this seems like a gap regardless looking at existing FIXMEs, plus the alignment can now be inferred in this particular example without the flag. 
[1] https://godbolt.org/z/aKoc75so5 --- .../Transforms/IPO/AttributorAttributes.cpp | 32 +- .../Attributor/dereferenceable-1.ll | 9 +- llvm/test/Transforms/Attributor/nonnull.ll | 411 +++++++++--------- .../Attributor/value-simplify-pointer-info.ll | 6 +- llvm/test/Transforms/Attributor/willreturn.ll | 2 +- llvm/test/Transforms/FunctionAttrs/nonnull.ll | 3 +- 6 files changed, 234 insertions(+), 229 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index a6ac7610a2c7a..e806a02a1f58f 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -665,7 +665,10 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, return; SmallVector BrInsts; + SmallPtrSet Visited; auto Pred = [&](const Instruction *I) { + if (!Visited.insert(I).second) + return false; if (const BranchInst *Br = dyn_cast(I)) if (Br->isConditional()) BrInsts.push_back(Br); @@ -684,28 +687,10 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m} // // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m - // - // FIXME: Currently, recursive branches are not handled. For example, we - // can't deduce that ptr must be dereferenced in below function. 
- // - // void f(int a, int c, int *ptr) { - // if(a) - // if (b) { - // *ptr = 0; - // } else { - // *ptr = 1; - // } - // else { - // if (b) { - // *ptr = 0; - // } else { - // *ptr = 1; - // } - // } - // } Explorer->checkForAllContext(&CtxI, Pred); - for (const BranchInst *Br : BrInsts) { + while (!BrInsts.empty()) { + const BranchInst *Br = BrInsts.pop_back_val(); StateType ParentState; // The known state of the parent state is a conjunction of children's @@ -714,15 +699,18 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, for (const BasicBlock *BB : Br->successors()) { StateType ChildState; - size_t BeforeSize = Uses.size(); - followUsesInContext(AA, A, *Explorer, &BB->front(), Uses, ChildState); + const Instruction *I = &BB->front(); + followUsesInContext(AA, A, *Explorer, I, Uses, ChildState); // Erase uses which only appear in the child. for (auto It = Uses.begin() + BeforeSize; It != Uses.end();) It = Uses.erase(It); ParentState &= ChildState; + + // Check for recursive conditional branches. + Explorer->checkForAllContext(I, Pred); } // Use only known state. 
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll index 5bff2a2e6b208..246a8c42ba912 100644 --- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -555,12 +555,10 @@ cont2: ; *ptr = 4; ; } ; } -; -; FIXME: %ptr should be dereferenceable(4) define dso_local void @rec-branch-1(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; CHECK-LABEL: define {{[^@]+}}@rec-branch-1 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree writeonly captures(none) [[PTR:%.*]]) #[[ATTR3]] { +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE3:%.*]], label [[IF_THEN:%.*]] @@ -630,11 +628,10 @@ if.end8: ; preds = %if.then5, %if.else6 ; rec-branch-2(1, 1, 1, ptr); ; } ; } -; FIXME: %ptr should be dereferenceable(4) define dso_local void @rec-branch-2(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CHECK-LABEL: define {{[^@]+}}@rec-branch-2 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree writeonly captures(none) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE3:%.*]], label [[IF_THEN:%.*]] @@ -654,7 +651,7 @@ define dso_local void @rec-branch-2(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK-NEXT: store i32 3, ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: 
if.else6: -; CHECK-NEXT: tail call void @rec-branch-2(i32 noundef 1, i32 noundef 1, i32 noundef 1, ptr nofree writeonly captures(none) [[PTR]]) #[[ATTR8:[0-9]+]] +; CHECK-NEXT: tail call void @rec-branch-2(i32 noundef 1, i32 noundef 1, i32 noundef 1, ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]]) #[[ATTR8:[0-9]+]] ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: if.end8: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll index 2ff8a3fa3a688..57a6d09af64fa 100644 --- a/llvm/test/Transforms/Attributor/nonnull.ll +++ b/llvm/test/Transforms/Attributor/nonnull.ll @@ -32,16 +32,27 @@ define ptr @test2(ptr nonnull %p) { } define ptr @test2A(i1 %c, ptr %ret) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test2A -; CHECK-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] -; CHECK-NEXT: ret ptr [[RET]] -; CHECK: B: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[RET]]) ] -; CHECK-NEXT: ret ptr [[RET]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test2A +; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { +; TUNIT-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; TUNIT: A: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] +; TUNIT-NEXT: ret ptr [[RET]] +; TUNIT: B: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "nonnull"(ptr [[RET]]) ] 
+; TUNIT-NEXT: ret ptr [[RET]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: define {{[^@]+}}@test2A +; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { +; CGSCC-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CGSCC: A: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] +; CGSCC-NEXT: ret ptr [[RET]] +; CGSCC: B: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[RET]]) ] +; CGSCC-NEXT: ret ptr [[RET]] ; br i1 %c, label %A, label %B A: @@ -53,16 +64,27 @@ B: } define ptr @test2B(i1 %c, ptr %ret) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test2B -; CHECK-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] -; CHECK-NEXT: ret ptr [[RET]] -; CHECK: B: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] -; CHECK-NEXT: ret ptr [[RET]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test2B +; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; TUNIT: A: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; TUNIT-NEXT: ret ptr [[RET]] +; TUNIT: B: +; TUNIT-NEXT: call void 
@llvm.assume(i1 noundef true) #[[ATTR15]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; TUNIT-NEXT: ret ptr [[RET]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: define {{[^@]+}}@test2B +; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CGSCC: A: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; CGSCC-NEXT: ret ptr [[RET]] +; CGSCC: B: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; CGSCC-NEXT: ret ptr [[RET]] ; br i1 %c, label %A, label %B A: @@ -273,13 +295,21 @@ define ptr @test9(ptr %a, i64 %n) { ; ATTRIBUTOR_OPM: define ptr @test10 ; ATTRIBUTOR_NPM: define nonnull ptr @test10 define ptr @test10(ptr %a, i64 %n) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test10 -; CHECK-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR16]] -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] -; CHECK-NEXT: ret ptr [[B]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test10 +; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 +; TUNIT-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR15]] +; TUNIT-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] +; TUNIT-NEXT: ret ptr [[B]] +; +; CGSCC: Function Attrs: 
mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: define {{[^@]+}}@test10 +; CGSCC-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 +; CGSCC-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR16]] +; CGSCC-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] +; CGSCC-NEXT: ret ptr [[B]] ; %cmp = icmp ne i64 %n, 0 call void @llvm.assume(i1 %cmp) @@ -392,50 +422,22 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { -; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull readonly %arg) -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) -; TUNIT-LABEL: define {{[^@]+}}@f1 -; TUNIT-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR6:[0-9]+]] { -; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null -; TUNIT-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] -; TUNIT: bb1: -; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4 -; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 -; TUNIT-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]] -; TUNIT: bb4: -; TUNIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1 -; TUNIT-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR17:[0-9]+]] -; TUNIT-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1 -; TUNIT-NEXT: br label [[BB9]] -; TUNIT: bb6: -; TUNIT-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP7]] -; TUNIT: bb9: -; TUNIT-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ] -; TUNIT-NEXT: ret ptr [[TMP10]] -; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) ; 
CGSCC-LABEL: define {{[^@]+}}@f1 -; CGSCC-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { +; CGSCC-SAME: (ptr nofree nonnull readonly align 4 captures(none) dereferenceable(4) [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null -; CGSCC-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] +; CGSCC-NEXT: br label [[BB1:%.*]] ; CGSCC: bb1: -; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4 +; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4, !invariant.load [[META0:![0-9]+]] ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]] ; CGSCC: bb4: -; CGSCC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1 -; CGSCC-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR17:[0-9]+]] -; CGSCC-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1 -; CGSCC-NEXT: br label [[BB9]] +; CGSCC-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr undef, i64 -1 +; CGSCC-NEXT: br label [[BB9:%.*]] ; CGSCC: bb6: -; CGSCC-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP7]] +; CGSCC-NEXT: ret ptr undef ; CGSCC: bb9: -; CGSCC-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ] -; CGSCC-NEXT: ret ptr [[TMP10]] +; CGSCC-NEXT: ret ptr undef ; bb: @@ -463,19 +465,11 @@ bb9: ; preds = %bb4, %bb } define internal ptr @f2(ptr %arg) { -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) -; TUNIT-LABEL: define {{[^@]+}}@f2 -; TUNIT-SAME: (ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR6]] { -; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP]] -; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; 
CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@f2 -; CGSCC-SAME: (ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR5]] { +; CGSCC-SAME: (ptr noalias nofree nonnull readnone align 4 captures(none) dereferenceable(4) [[ARG:%.*]]) #[[ATTR6:[0-9]+]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP]] +; CGSCC-NEXT: ret ptr undef ; bb: %tmp = tail call ptr @f1(ptr %arg) @@ -484,19 +478,17 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f3(ptr nonnull readonly %arg) -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) +; TUNIT: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@f3 -; TUNIT-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR6]] { +; TUNIT-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP]] +; TUNIT-NEXT: ret ptr undef ; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@f3 -; CGSCC-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR5]] { +; CGSCC-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR1]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP]] +; CGSCC-NEXT: ret ptr undef ; bb: ; FIXME: missing nonnull. 
It should be @f1(ptr nonnull readonly %arg) @@ -529,26 +521,26 @@ declare void @fun3(ptr, ptr, ptr) #1 define void @f16(ptr %a, ptr %b, i8 %c) { ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f16 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] { ; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR7:[0-9]+]] +; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR6:[0-9]+]] ; TUNIT-NEXT: ret void ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f16 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] { ; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR6:[0-9]+]] +; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR7:[0-9]+]] ; CGSCC-NEXT: ret void ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp = icmp eq i8 %c, 0 @@ -571,32 +563,32 @@ define void @f17(ptr %a, i8 %c) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f17 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i8 
[[C:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT:%.*]] ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT]] ; TUNIT: cont: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f17 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT:%.*]] ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT]] ; CGSCC: cont: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp = icmp eq i8 %c, 0 @@ -625,50 +617,50 @@ cont: define void @f18(ptr %a, ptr %b, i8 %c) { ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f18 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; 
TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT:%.*]] ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT]] ; TUNIT: cont: ; TUNIT-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1 ; TUNIT-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]] ; TUNIT: cont.then: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT2:%.*]] ; TUNIT: cont.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT2]] ; TUNIT: cont2: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f18 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT:%.*]] ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT]] ; CGSCC: cont: ; CGSCC-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1 ; CGSCC-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]] ; CGSCC: cont.then: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT2:%.*]] ; CGSCC: cont.else: -; CGSCC-NEXT: tail call void 
@fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT2]] ; CGSCC: cont2: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp1 = icmp eq i8 %c, 0 @@ -857,11 +849,17 @@ define i8 @parent6(ptr %a, ptr %b) { ; The nonnull callsite is guaranteed to execute, so the argument must be nonnull throughout the parent. define i8 @parent7(ptr %a) { -; CHECK-LABEL: define {{[^@]+}}@parent7 -; CHECK-SAME: (ptr nonnull [[A:%.*]]) { -; CHECK-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR18:[0-9]+]] -; CHECK-NEXT: call void @use1nonnull(ptr nonnull [[A]]) -; CHECK-NEXT: ret i8 [[RET]] +; TUNIT-LABEL: define {{[^@]+}}@parent7 +; TUNIT-SAME: (ptr nonnull [[A:%.*]]) { +; TUNIT-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR16:[0-9]+]] +; TUNIT-NEXT: call void @use1nonnull(ptr nonnull [[A]]) +; TUNIT-NEXT: ret i8 [[RET]] +; +; CGSCC-LABEL: define {{[^@]+}}@parent7 +; CGSCC-SAME: (ptr nonnull [[A:%.*]]) { +; CGSCC-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR17:[0-9]+]] +; CGSCC-NEXT: call void @use1nonnull(ptr nonnull [[A]]) +; CGSCC-NEXT: ret i8 [[RET]] ; @@ -931,13 +929,13 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { ; Should't be able to derive nonnull based on gep. 
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@gep1_no_null_opt -; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; TUNIT-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; TUNIT-NEXT: ret ptr [[Q]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@gep1_no_null_opt -; CGSCC-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR9:[0-9]+]] { +; CGSCC-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR10:[0-9]+]] { ; CGSCC-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; CGSCC-NEXT: ret ptr [[Q]] ; @@ -983,8 +981,8 @@ define ptr @g1() { ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@g1 -; CGSCC-SAME: () #[[ATTR10:[0-9]+]] { -; CGSCC-NEXT: [[C:%.*]] = call noundef nonnull align 4 ptr @g2() #[[ATTR19:[0-9]+]] +; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-NEXT: [[C:%.*]] = call noundef nonnull align 4 ptr @g2() #[[ATTR18:[0-9]+]] ; CGSCC-NEXT: ret ptr [[C]] ; %c = call ptr @g2() @@ -1045,21 +1043,32 @@ define internal void @control(ptr dereferenceable(4) %a) { } ; Avoid nonnull as we do not touch naked functions define internal void @naked(ptr dereferenceable(4) %a) naked { -; CHECK: Function Attrs: naked -; CHECK-LABEL: define {{[^@]+}}@naked -; CHECK-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: naked +; TUNIT-LABEL: define {{[^@]+}}@naked +; TUNIT-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: naked +; CGSCC-LABEL: define 
{{[^@]+}}@naked +; CGSCC-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { +; CGSCC-NEXT: ret void ; ret void } ; Avoid nonnull as we do not touch optnone define internal void @optnone(ptr dereferenceable(4) %a) optnone noinline { ; -; CHECK: Function Attrs: noinline optnone -; CHECK-LABEL: define {{[^@]+}}@optnone -; CHECK-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] { -; CHECK-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: noinline optnone +; TUNIT-LABEL: define {{[^@]+}}@optnone +; TUNIT-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { +; TUNIT-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: noinline optnone +; CGSCC-LABEL: define {{[^@]+}}@optnone +; CGSCC-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] { +; CGSCC-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) +; CGSCC-NEXT: ret void ; call void @use_i32_ptr(ptr %a) ret void @@ -1098,32 +1107,32 @@ define i32 @nonnull_exec_ctx_1(ptr %a, i32 %b) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1 -; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; TUNIT: ex: -; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret i32 [[TMP5]] ; TUNIT: hd: ; TUNIT-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ] -; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] ; 
TUNIT-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 ; TUNIT-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]] ; TUNIT-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]] ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1 -; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; CGSCC: ex: -; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret i32 [[TMP5]] ; CGSCC: hd: ; CGSCC-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ] -; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] ; CGSCC-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 ; CGSCC-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]] ; CGSCC-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]] @@ -1148,16 +1157,16 @@ define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b -; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; TUNIT: ex: -; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret i32 [[TMP5]] ; TUNIT: hd: ; TUNIT-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ] -; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] ; TUNIT-NEXT: br label [[HD2]] ; TUNIT: hd2: ; TUNIT-NEXT: [[TMP8]] 
= add nuw i32 [[TMP7]], 1 @@ -1166,16 +1175,16 @@ define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b -; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; CGSCC: ex: -; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret i32 [[TMP5]] ; CGSCC: hd: ; CGSCC-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ] -; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] ; CGSCC-NEXT: br label [[HD2]] ; CGSCC: hd2: ; CGSCC-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 @@ -1205,7 +1214,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1221,7 +1230,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1255,7 +1264,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind { ; ; 
TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1273,7 +1282,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1392,8 +1401,8 @@ declare ptr @strrchr(ptr %0, i32 %1) nofree nounwind readonly willreturn define ptr @mybasename(ptr nofree readonly %str) { ; TUNIT: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read) ; TUNIT-LABEL: define {{[^@]+}}@mybasename -; TUNIT-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR14:[0-9]+]] { -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR19:[0-9]+]] +; TUNIT-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR13:[0-9]+]] { +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR17:[0-9]+]] ; TUNIT-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[CALL]], null ; TUNIT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 1 ; TUNIT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], ptr [[ADD_PTR]], ptr [[STR]] @@ -1402,7 +1411,7 @@ define ptr @mybasename(ptr nofree readonly %str) { ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read) ; CGSCC-LABEL: define {{[^@]+}}@mybasename ; CGSCC-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR14:[0-9]+]] { -; CGSCC-NEXT: [[CALL:%.*]] 
= call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR20:[0-9]+]] +; CGSCC-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR19:[0-9]+]] ; CGSCC-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[CALL]], null ; CGSCC-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 1 ; CGSCC-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], ptr [[ADD_PTR]], ptr [[STR]] @@ -1425,7 +1434,7 @@ define void @nonnull_assume_pos(ptr %arg) { ; ; TUNIT-LABEL: define {{[^@]+}}@nonnull_assume_pos ; TUNIT-SAME: (ptr nofree nonnull readnone captures(none) [[ARG:%.*]]) { -; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[ARG]]) ] +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "nonnull"(ptr [[ARG]]) ] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[ARG]]) #[[ATTR5]] ; TUNIT-NEXT: [[TMP1:%.*]] = call ptr @unknown() ; TUNIT-NEXT: ret void @@ -1554,14 +1563,14 @@ define void @phi_caller(ptr %p) { ; TUNIT: Function Attrs: nounwind ; TUNIT-LABEL: define {{[^@]+}}@phi_caller ; TUNIT-SAME: (ptr nofree [[P:%.*]]) #[[ATTR5]] { -; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR20:[0-9]+]] +; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR18:[0-9]+]] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR5]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nounwind ; CGSCC-LABEL: define {{[^@]+}}@phi_caller ; CGSCC-SAME: (ptr nofree [[P:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR21:[0-9]+]] +; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR20:[0-9]+]] ; CGSCC-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR4]] ; CGSCC-NEXT: ret void ; @@ -1594,14 +1603,14 @@ define void 
@multi_ret_caller(ptr %p) { ; TUNIT: Function Attrs: nounwind ; TUNIT-LABEL: define {{[^@]+}}@multi_ret_caller ; TUNIT-SAME: (ptr nofree [[P:%.*]]) #[[ATTR5]] { -; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR20]] +; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR18]] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR5]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nounwind ; CGSCC-LABEL: define {{[^@]+}}@multi_ret_caller ; CGSCC-SAME: (ptr nofree [[P:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR21]] +; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR20]] ; CGSCC-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR4]] ; CGSCC-NEXT: ret void ; @@ -1613,18 +1622,31 @@ define void @multi_ret_caller(ptr %p) { ; From https://github.com/llvm/llvm-project/pull/85810 @G = internal global i64 1, align 8 define dso_local ptr @update_global_in_alive_bb() { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CHECK-LABEL: define {{[^@]+}}@update_global_in_alive_bb -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i64 0, ptr @G, align 8 -; CHECK-NEXT: ret ptr inttoptr (i64 5 to ptr) -; CHECK: if.else: -; CHECK-NEXT: ret ptr null +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; TUNIT-LABEL: define {{[^@]+}}@update_global_in_alive_bb +; TUNIT-SAME: () #[[ATTR14:[0-9]+]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 +; TUNIT-NEXT: [[CMP:%.*]] 
= icmp ne i64 [[TMP0]], 0 +; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; TUNIT: if.then: +; TUNIT-NEXT: store i64 0, ptr @G, align 8 +; TUNIT-NEXT: ret ptr inttoptr (i64 5 to ptr) +; TUNIT: if.else: +; TUNIT-NEXT: ret ptr null +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; CGSCC-LABEL: define {{[^@]+}}@update_global_in_alive_bb +; CGSCC-SAME: () #[[ATTR15:[0-9]+]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 +; CGSCC-NEXT: [[CMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CGSCC: if.then: +; CGSCC-NEXT: store i64 0, ptr @G, align 8 +; CGSCC-NEXT: ret ptr inttoptr (i64 5 to ptr) +; CGSCC: if.else: +; CGSCC-NEXT: ret ptr null ; entry: %0 = load i64, ptr @G, align 8 @@ -1640,48 +1662,47 @@ if.else: attributes #0 = { null_pointer_is_valid } attributes #1 = { nounwind willreturn} ;. -; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } -; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR4]] = { noreturn } -; TUNIT: attributes #[[ATTR5]] = { nounwind } -; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind memory(argmem: read) } -; TUNIT: attributes #[[ATTR7]] = { nounwind willreturn } -; TUNIT: attributes #[[ATTR8]] = { mustprogress nounwind willreturn } -; TUNIT: attributes #[[ATTR9:[0-9]+]] = { nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } -; TUNIT: attributes #[[ATTR11]] = { naked } -; TUNIT: attributes 
#[[ATTR12]] = { noinline optnone } -; TUNIT: attributes #[[ATTR13:[0-9]+]] = { nofree nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR14]] = { mustprogress nofree nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR15]] = { mustprogress nofree norecurse nosync nounwind willreturn } -; TUNIT: attributes #[[ATTR16]] = { nofree willreturn memory(write) } -; TUNIT: attributes #[[ATTR17]] = { nofree nosync nounwind memory(read) } -; TUNIT: attributes #[[ATTR18]] = { nosync willreturn memory(read) } -; TUNIT: attributes #[[ATTR19]] = { nofree nosync willreturn memory(read) } -; TUNIT: attributes #[[ATTR20]] = { nofree nosync nounwind willreturn memory(none) } -;. ; CGSCC: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } ; CGSCC: attributes #[[ATTR3]] = { noreturn } ; CGSCC: attributes #[[ATTR4]] = { nounwind } -; CGSCC: attributes #[[ATTR5]] = { nofree nosync nounwind memory(argmem: read) } -; CGSCC: attributes #[[ATTR6]] = { nounwind willreturn } -; CGSCC: attributes #[[ATTR7]] = { mustprogress nounwind willreturn } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nounwind willreturn memory(read) } -; CGSCC: attributes #[[ATTR9]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } -; CGSCC: attributes #[[ATTR10]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: read) } +; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; CGSCC: attributes #[[ATTR7]] = { nounwind willreturn } +; CGSCC: attributes #[[ATTR8]] = { mustprogress nounwind willreturn } +; 
CGSCC: attributes #[[ATTR9:[0-9]+]] = { nounwind willreturn memory(read) } +; CGSCC: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } ; CGSCC: attributes #[[ATTR11]] = { naked } ; CGSCC: attributes #[[ATTR12]] = { noinline optnone } ; CGSCC: attributes #[[ATTR13:[0-9]+]] = { nofree nounwind willreturn memory(read) } ; CGSCC: attributes #[[ATTR14]] = { mustprogress nofree nosync nounwind willreturn memory(read) } ; CGSCC: attributes #[[ATTR15]] = { mustprogress nofree norecurse nosync nounwind willreturn } ; CGSCC: attributes #[[ATTR16]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR17]] = { nofree nosync nounwind memory(read) } -; CGSCC: attributes #[[ATTR18]] = { nosync willreturn memory(read) } -; CGSCC: attributes #[[ATTR19]] = { nofree nosync willreturn } -; CGSCC: attributes #[[ATTR20]] = { nofree nosync willreturn memory(read) } -; CGSCC: attributes #[[ATTR21]] = { nofree willreturn } +; CGSCC: attributes #[[ATTR17]] = { nosync willreturn memory(read) } +; CGSCC: attributes #[[ATTR18]] = { nofree nosync willreturn } +; CGSCC: attributes #[[ATTR19]] = { nofree nosync willreturn memory(read) } +; CGSCC: attributes #[[ATTR20]] = { nofree willreturn } +;. 
+; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } +; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR4]] = { noreturn } +; TUNIT: attributes #[[ATTR5]] = { nounwind } +; TUNIT: attributes #[[ATTR6]] = { nounwind willreturn } +; TUNIT: attributes #[[ATTR7]] = { mustprogress nounwind willreturn } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR9]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } +; TUNIT: attributes #[[ATTR10]] = { naked } +; TUNIT: attributes #[[ATTR11]] = { noinline optnone } +; TUNIT: attributes #[[ATTR12:[0-9]+]] = { nofree nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR13]] = { mustprogress nofree nosync nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR14]] = { mustprogress nofree norecurse nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR15]] = { nofree willreturn memory(write) } +; TUNIT: attributes #[[ATTR16]] = { nosync willreturn memory(read) } +; TUNIT: attributes #[[ATTR17]] = { nofree nosync willreturn memory(read) } +; TUNIT: attributes #[[ATTR18]] = { nofree nosync nounwind willreturn memory(none) } +;. +; CGSCC: [[META0]] = !{} ;. 
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index 3e07fe42261e9..2235f194af8ea 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -1267,7 +1267,7 @@ entry: define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn ; TUNIT-LABEL: define void @noalias_arg_simplifiable_2( -; TUNIT-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; TUNIT-SAME: ptr nofree nonnull captures(none) dereferenceable(24) [[BYTES:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: [[ENTRY:.*]]: ; TUNIT-NEXT: br label %[[FOR_COND:.*]] ; TUNIT: [[FOR_COND]]: @@ -1344,7 +1344,7 @@ define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn ; CGSCC-LABEL: define void @noalias_arg_simplifiable_2( -; CGSCC-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; CGSCC-SAME: ptr nofree nonnull align 4 captures(none) dereferenceable(1024) [[BYTES:%.*]]) #[[ATTR3]] { ; CGSCC-NEXT: [[ENTRY:.*]]: ; CGSCC-NEXT: br label %[[FOR_COND:.*]] ; CGSCC: [[FOR_COND]]: @@ -1399,7 +1399,7 @@ define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; CGSCC-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 1023 ; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 500 -; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR21]] +; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR21]] ; CGSCC-NEXT: br label %[[FOR_COND27:.*]] ; CGSCC: [[FOR_COND27]]: ; CGSCC-NEXT: 
[[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC35:.*]] ], [ 0, %[[FOR_END23]] ] diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index d65480b05759a..543f33ee0621b 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -238,7 +238,7 @@ define void @only_exit() local_unnamed_addr #0 { define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_addr #0 { ; CHECK: Function Attrs: noinline nounwind uwtable ; CHECK-LABEL: define {{[^@]+}}@conditional_exit -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr nofree readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK-SAME: (i32 [[TMP0:%.*]], ptr nofree nonnull readonly align 4 captures(none) dereferenceable(4) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] ; CHECK: 4: diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 9d5ae1606f2e3..e06fb1cfd9656 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -360,7 +360,6 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { -; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( ; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: @@ -383,7 +382,7 @@ define internal ptr @f1(ptr %arg) { ; FNATTRS-NEXT: ret ptr [[TMP10]] ; ; ATTRIBUTOR-LABEL: define internal ptr @f1( -; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; ATTRIBUTOR-SAME: ptr nofree nonnull readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; ATTRIBUTOR-NEXT: br 
i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] From 999deef63df5a057350a1e3bf211e536d5cfbc82 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Mon, 24 Nov 2025 07:51:55 -0500 Subject: [PATCH 05/19] Desugar complex element types for promoted complex division (#168943) This patch fixes a crash in Clang that occurs when the compiler retrieves the element type of a complex type but receives a sugared type. See example here: https://godbolt.org/z/cdbdeMcaT This patch fixes the crash. --- clang/lib/CodeGen/CGExprComplex.cpp | 2 +- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/test/CodeGen/promoted-complex-div.c | 52 +++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index d281c4c20616a..bca7c30557f03 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -320,7 +320,7 @@ class ComplexExprEmitter QualType getPromotionType(FPOptionsOverride Features, QualType Ty, bool IsComplexDivisor) { if (auto *CT = Ty->getAs()) { - QualType ElementType = CT->getElementType(); + QualType ElementType = CT->getElementType().getCanonicalType(); bool IsFloatingType = ElementType->isFloatingType(); bool IsComplexRangePromoted = CGF.getLangOpts().getComplexRange() == LangOptions::ComplexRangeKind::CX_Promoted; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 10f0ec3010c6c..d3c2cc559ea20 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -10726,7 +10726,7 @@ static void DetectPrecisionLossInComplexDivision(Sema &S, QualType DivisorTy, if (!CT) return; - QualType ElementType = CT->getElementType(); + QualType ElementType = CT->getElementType().getCanonicalType(); bool IsComplexRangePromoted = S.getLangOpts().getComplexRange() == LangOptions::ComplexRangeKind::CX_Promoted; if (!ElementType->isFloatingType() || !IsComplexRangePromoted) diff --git a/clang/test/CodeGen/promoted-complex-div.c 
b/clang/test/CodeGen/promoted-complex-div.c index 7ed7b07db83ae..006b5e334e6ea 100644 --- a/clang/test/CodeGen/promoted-complex-div.c +++ b/clang/test/CodeGen/promoted-complex-div.c @@ -81,3 +81,55 @@ _Complex double divf(_Complex double a, _Complex double b) { return a / b; // nopromotion-warning{{excess precision is requested but the target does not support excess precision which may result in observable differences in complex division behavior}} } + +// This test ensures that Clang does not crash when complex element types +// require desugaring under -complex-range=promoted. Previously, a sugared +// typedef element type (e.g., 'typedef double a') caused a crash during +// complex range evaluation in both Sema and CodeGen. +typedef double a; +_Complex double *b; +// CHECK-LABEL: define dso_local void @DivideByComplexZero +void DivideByComplexZero() { + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fadd x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fsub x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fptrunc x86_fp80 + // CHECK: fptrunc x86_fp80 + + // NOX87: call double @llvm.fabs.f64(double {{.*}}) + // NOX87-NEXT: call double @llvm.fabs.f64(double {{.*}} + // NOX87-NEXT: fcmp ugt double {{.*}}, {{.*}} + // NOX87-NEXT: br i1 {{.*}}, label + // NOX87: abs_rhsr_greater_or_equal_abs_rhsi: + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label {{.*}} + // NOX87: abs_rhsr_less_than_abs_rhsi: + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label {{.*}} + // NOX87: complex_div: + // NOX87-NEXT: phi double + // NOX87-NEXT: phi 
double + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 0 + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 1 + // NOX87-NEXT: store double + // NOX87-NEXT: store double + + *b /= 1.0iF * (a)0; +} From e5755395417ceaa9cd049e69593cb0dcc7d0e65c Mon Sep 17 00:00:00 2001 From: Jack Frankland Date: Mon, 24 Nov 2025 12:58:34 +0000 Subject: [PATCH 06/19] [milr][memref]: Fold expand_shape + transfer_read (#167679) Extend the load of a expand shape rewrite pattern to support folding a `memref.expand_shape` and `vector.transfer_read` when the permutation map on `vector.transfer_read` is a minor identity. --------- Signed-off-by: Jack Frankland --- .../MemRef/Transforms/FoldMemRefAliasOps.cpp | 32 +++++++++++- .../Dialect/MemRef/fold-memref-alias-ops.mlir | 49 +++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp index 214410f78e51c..3667fdb2bb728 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -347,28 +347,55 @@ LogicalResult LoadOpOfExpandShapeOpFolder::matchAndRewrite( loadOp.getLoc(), rewriter, expandShapeOp, indices, sourceIndices, isa(loadOp.getOperation())))) return failure(); - llvm::TypeSwitch(loadOp) + + return llvm::TypeSwitch(loadOp) .Case([&](affine::AffineLoadOp op) { rewriter.replaceOpWithNewOp( loadOp, expandShapeOp.getViewSource(), sourceIndices); + return success(); }) .Case([&](memref::LoadOp op) { rewriter.replaceOpWithNewOp( loadOp, expandShapeOp.getViewSource(), sourceIndices, op.getNontemporal()); + return success(); }) .Case([&](vector::LoadOp op) { rewriter.replaceOpWithNewOp( op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, op.getNontemporal()); + return success(); }) .Case([&](vector::MaskedLoadOp op) { 
rewriter.replaceOpWithNewOp( op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, op.getMask(), op.getPassThru()); + return success(); + }) + .Case([&](vector::TransferReadOp op) { + // We only support minor identity maps in the permutation attribute. + if (!op.getPermutationMap().isMinorIdentity()) + return failure(); + + // We only support the case where the source of the expand shape has + // rank greater than or equal to the vector rank. + const int64_t sourceRank = sourceIndices.size(); + const int64_t vectorRank = op.getVectorType().getRank(); + if (sourceRank < vectorRank) + return failure(); + + // We need to construct a new minor identity map since we will have lost + // some dimensions in folding away the expand shape. + auto minorIdMap = AffineMap::getMinorIdentityMap(sourceRank, vectorRank, + op.getContext()); + + rewriter.replaceOpWithNewOp( + op, op.getVectorType(), expandShapeOp.getViewSource(), + sourceIndices, minorIdMap, op.getPadding(), op.getMask(), + op.getInBounds()); + return success(); }) .DefaultUnreachable("unexpected operation"); - return success(); } template @@ -659,6 +686,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { LoadOpOfExpandShapeOpFolder, LoadOpOfExpandShapeOpFolder, LoadOpOfExpandShapeOpFolder, + LoadOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index 106652623933f..ca91b0141f593 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -992,6 +992,55 @@ func.func @fold_vector_maskedstore_expand_shape( // ----- +func.func @fold_vector_transfer_read_expand_shape( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1]] 
output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.transfer_read %0[%arg1, %c0], %pad {in_bounds = [true]} : memref<4x8xf32>, vector<8xf32> + return %1 : vector<8xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_expand_shape +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[PAD:.*]] = ub.poison : f32 +// CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (4, 8) +// CHECK: vector.transfer_read %[[ARG0]][%[[IDX]]], %[[PAD]] {in_bounds = [true]} + +// ----- + +func.func @fold_vector_transfer_read_with_perm_map( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<4x4xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.transfer_read %0[%arg1, %c0], %pad { permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : memref<4x8xf32>, vector<4x4xf32> + return %1 : vector<4x4xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_with_perm_map +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: memref.expand_shape %[[ARG0]] {{\[}}[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + +// ----- + +func.func @fold_vector_transfer_read_rank_mismatch( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<4x4xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [2, 4, 4] : memref<32xf32> into memref<2x4x4xf32> + %1 = vector.transfer_read %0[%arg1, %c0, %c0], %pad {in_bounds = [true, true]} : memref<2x4x4xf32>, vector<4x4xf32> + return %1 : vector<4x4xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_rank_mismatch +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: memref.expand_shape %[[ARG0]] {{\[}}[0, 1, 2]] output_shape [2, 4, 4] : memref<32xf32> into 
memref<2x4x4xf32> + +// ----- + func.func @fold_vector_load_collapse_shape( %arg0 : memref<4x8xf32>, %arg1 : index) -> vector<8xf32> { %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32> From a27842ce0698299eed4fbe076560b8d785d50444 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Mon, 24 Nov 2025 21:16:05 +0800 Subject: [PATCH 07/19] [X86][NFC] Add `-show-mc-encoding` to check register misuse (#169264) --- llvm/test/CodeGen/X86/apx/no-rex2-general.ll | 70 +++++++++--------- .../CodeGen/X86/apx/no-rex2-pseudo-amx.ll | 16 ++-- .../CodeGen/X86/apx/no-rex2-pseudo-x87.ll | 22 +++--- llvm/test/CodeGen/X86/apx/no-rex2-special.ll | 74 +++++++++---------- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll index 2b34739fa80e3..6f31aef9aee98 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll @@ -1,17 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) { ; CHECK-LABEL: map0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %r16 -; CHECK-NEXT: movq %rdi, %r17 +; CHECK-NEXT: movq %rsi, %r16 # encoding: [0xd5,0x18,0x89,0xf0] +; CHECK-NEXT: movq %rdi, %r17 # encoding: [0xd5,0x18,0x89,0xf9] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; 
CHECK-NEXT: movl (%r17,%r16,4), %eax -; CHECK-NEXT: retq +; CHECK-NEXT: movl (%r17,%r16,4), %eax # encoding: [0xd5,0x30,0x8b,0x04,0x81] +; CHECK-NEXT: retq # encoding: [0xc3] entry: %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() @@ -22,23 +22,23 @@ entry: define i32 @map1_or_vex(<2 x double> noundef %a) nounwind { ; SSE-LABEL: map1_or_vex: ; SSE: # %bb.0: # %entry -; SSE-NEXT: cvtsd2si %xmm0, %r16d +; SSE-NEXT: cvtsd2si %xmm0, %r16d # encoding: [0xf2,0xd5,0xc0,0x2d,0xc0] ; SSE-NEXT: #APP -; SSE-NEXT: nop +; SSE-NEXT: nop # encoding: [0x90] ; SSE-NEXT: #NO_APP -; SSE-NEXT: movl %r16d, %eax -; SSE-NEXT: retq +; SSE-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; SSE-NEXT: retq # encoding: [0xc3] ; ; AVX-LABEL: map1_or_vex: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vcvtsd2si %xmm0, %ebx +; AVX-NEXT: pushq %rbx # encoding: [0x53] +; AVX-NEXT: vcvtsd2si %xmm0, %ebx # encoding: [0xc5,0xfb,0x2d,0xd8] ; AVX-NEXT: #APP -; AVX-NEXT: nop +; AVX-NEXT: nop # encoding: [0x90] ; AVX-NEXT: #NO_APP -; AVX-NEXT: movl %ebx, %eax -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; AVX-NEXT: movl %ebx, %eax # encoding: [0x89,0xd8] +; AVX-NEXT: popq %rbx # encoding: [0x5b] +; AVX-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a) tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() @@ -48,31 +48,31 @@ entry: define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind { ; SSE-LABEL: map2_or_vex: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq %rsi, %rbx -; SSE-NEXT: movq %rdi, %r14 +; SSE-NEXT: pushq %r14 # encoding: [0x41,0x56] +; SSE-NEXT: pushq %rbx # encoding: [0x53] +; SSE-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; SSE-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; 
SSE-NEXT: #APP -; SSE-NEXT: nop +; SSE-NEXT: nop # encoding: [0x90] ; SSE-NEXT: #NO_APP -; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: retq +; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 # encoding: [0x66,0x41,0x0f,0x38,0x1c,0x04,0x9e] +; SSE-NEXT: popq %rbx # encoding: [0x5b] +; SSE-NEXT: popq %r14 # encoding: [0x41,0x5e] +; SSE-NEXT: retq # encoding: [0xc3] ; ; AVX-LABEL: map2_or_vex: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rsi, %rbx -; AVX-NEXT: movq %rdi, %r14 +; AVX-NEXT: pushq %r14 # encoding: [0x41,0x56] +; AVX-NEXT: pushq %rbx # encoding: [0x53] +; AVX-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; AVX-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; AVX-NEXT: #APP -; AVX-NEXT: nop +; AVX-NEXT: nop # encoding: [0x90] ; AVX-NEXT: #NO_APP -; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: retq +; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 # encoding: [0xc4,0xc2,0x79,0x1c,0x04,0x9e] +; AVX-NEXT: popq %rbx # encoding: [0x5b] +; AVX-NEXT: popq %r14 # encoding: [0x41,0x5e] +; AVX-NEXT: retq # encoding: [0xc3] entry: tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll index c193680607f76..a6ab98f8bf03e 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr --show-mc-encoding | FileCheck %s define dso_local void @amx(ptr noundef %data) nounwind { ; CHECK-LABEL: amx: ; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl $8, %eax -; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00] +; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x24,0x03] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] entry: tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8) diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll index 4692a58d095a6..e7bc0c362cad3 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr --show-mc-encoding | FileCheck %s define void @x87(ptr %0, ptr %1) nounwind { ; CHECK-LABEL: x87: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: pushq %r14 # encoding: [0x41,0x56] +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; CHECK-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: flds (%r14) -; CHECK-NEXT: fstps (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: retq +; 
CHECK-NEXT: flds (%r14) # encoding: [0x41,0xd9,0x06] +; CHECK-NEXT: fstps (%rbx) # encoding: [0xd9,0x1b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: popq %r14 # encoding: [0x41,0x5e] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %3 = load float, ptr %0 store float %3, ptr %1 diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll index f2025b5c8cbf8..9b89bce283b15 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr --show-mc-encoding | FileCheck %s define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xsave: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xsave (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xsave (%rbx) # encoding: [0x0f,0xae,0x23] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -24,17 +24,17 @@ declare void @llvm.x86.xsave(ptr, 
i32, i32) define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xsave64: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xsave64 (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xsave64 (%rbx) # encoding: [0x48,0x0f,0xae,0x23] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -44,17 +44,17 @@ declare void @llvm.x86.xsave64(ptr, i32, i32) define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xrstor: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xrstor (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xrstor (%rbx) # encoding: [0x0f,0xae,0x2b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm 
sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -64,17 +64,17 @@ declare void @llvm.x86.xrstor(ptr, i32, i32) define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xrstor64: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xrstor64 (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xrstor64 (%rbx) # encoding: [0x48,0x0f,0xae,0x2b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo) ret void; From d14840779bf9e4ba80e8955b0e846d112106f287 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Mon, 24 Nov 2025 21:32:26 +0800 Subject: [PATCH 08/19] [X86][AVX512] Add pseudos for `AVX512_*_SETALLONES` (#169009) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce `AVX512_128_SETALLONES`, `AVX512_256_SETALLONES` pseudos to generate all-ones vectors. Post-RA expansion: - Use VEX vpcmpeqd for XMM/YMM0–15 when available (matches current codegen as `AVX512_128/256_SETALLONES` will be preferred over `AVX1/2_SETALLONES` for AVX512VL target). - Use EVEX `vpternlogd imm=0xFF` for high regs. Includes MIR tests for both VEX and EVEX paths. 
--- llvm/lib/Target/X86/X86InstrAVX512.td | 6 ++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 32 ++++++++++++++++++- .../X86/avx512-i386-setallones-pseudo.mir | 26 +++++++++++++++ .../CodeGen/X86/avx512-setallones-pseudo.mir | 30 +++++++++++++++++ llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll | 3 +- 5 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir create mode 100644 llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 70564973816b1..e8fda829e2394 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllZerosV))]>; def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; +let AddedComplexity = 1, Predicates = [HasVLX] in { + def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins), + "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>; + def AVX512_256_SETALLONES : I<0, Pseudo, (outs VR256X:$dst), (ins), + "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>; +} } let Predicates = [HasAVX512] in { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index cb0208a4a5f32..b988ae0aca912 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -778,6 +778,8 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SH: @@ -6246,9 +6248,31 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } + case 
X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: { Register Reg = MIB.getReg(0); - MIB->setDesc(get(X86::VPTERNLOGDZrri)); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::AVX512_128_SETALLONES: { + if (X86::VR128RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr)); + + Opc = X86::VPTERNLOGDZ128rri; + break; + } + case X86::AVX512_256_SETALLONES: { + if (X86::VR256RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + + Opc = X86::VPTERNLOGDZ256rri; + break; + } + case X86::AVX512_512_SETALLONES: + Opc = X86::VPTERNLOGDZrri; + break; + } + MIB->setDesc(get(Opc)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. MIB.addReg(Reg, RegState::Undef) @@ -8190,6 +8214,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: + case X86::AVX512_256_SETALLONES: Alignment = Align(32); break; case X86::V_SET0: @@ -8197,6 +8222,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: + case X86::AVX512_128_SETALLONES: Alignment = Align(16); break; case X86::MMX_SET0: @@ -8255,6 +8281,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: @@ -8315,6 +8343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( break; case X86::AVX1_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX512_256_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::AVX512_256_SET0: @@ -8328,6 +8357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( 2); break; case X86::V_SETALLONES: + case X86::AVX512_128_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::V_SET0: 
diff --git a/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir new file mode 100644 index 0000000000000..0d8f2177aaa30 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir @@ -0,0 +1,26 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +# RUN: llc %s -mtriple=i386-- -start-before=postrapseudos -o - | FileCheck %s + +--- | + target triple = "i386-unknown-unknown" + + define void @setallones() #0 { + ; CHECK-LABEL: setallones: + ; CHECK: # %bb.0: + ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 + ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 + entry: + unreachable + } + + attributes #0 = { "target-features"="+avx512f,+avx512vl" } +--- +name: setallones +tracksRegLiveness: true +liveins: [] +body: | + bb.0: + $xmm0 = AVX512_128_SETALLONES + $ymm1 = AVX512_256_SETALLONES + +... diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir new file mode 100644 index 0000000000000..7e5ddc4cd632f --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir @@ -0,0 +1,30 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s + +--- | + target triple = "x86_64-unknown-unknown" + + define void @setallones() #0 { + ; CHECK-LABEL: setallones: + ; CHECK: # %bb.0: + ; CHECK-NEXT: vpcmpeqd %xmm14, %xmm14, %xmm14 + ; CHECK-NEXT: vpternlogd {{.*#+}} xmm16 = -1 + ; CHECK-NEXT: vpcmpeqd %ymm15, %ymm15, %ymm15 + ; CHECK-NEXT: vpternlogd {{.*#+}} ymm17 = -1 + entry: + unreachable + } + + attributes #0 = { "target-features"="+avx512f,+avx512vl" } +--- +name: setallones +tracksRegLiveness: true +liveins: [] +body: | + bb.0: + $xmm14 = AVX512_128_SETALLONES + $xmm16 = AVX512_128_SETALLONES + $ymm15 = AVX512_256_SETALLONES + $ymm17 = AVX512_256_SETALLONES + +... 
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 3243d950740ca..e2400fbe2c4ff 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 32 -; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq use.v4.i32@PLT ; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload From 83765f435d1ca1ffc29ebe0ad979bfb70a22ff70 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Mon, 24 Nov 2025 13:38:41 +0000 Subject: [PATCH 09/19] [Utils][update_mc_test_checks] Support generating asm tests from templates. (#168946) Reduces the pain of manual editing tests applying the same changes over multiple instructions and keeping them consistent. 
--- llvm/test/MC/AMDGPU/gfx11_asm_vop1.s | 622 +++++++++++++----- .../Inputs/amdgpu-templates.s | 17 + .../Inputs/amdgpu-templates.s.expected | 32 + .../amdgpu-templates.test | 5 + llvm/utils/update_mc_test_checks.py | 104 ++- 5 files changed, 621 insertions(+), 159 deletions(-) create mode 100644 llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s create mode 100644 llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index f1438532d7c5e..5b4689b2954df 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -1,8 +1,195 @@ -// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -v_bfrev_b32_e32 v5, v1 +// INSTS= +// v_bfrev_b32 OPS32 +// v_ceil_f16 OPS16 +// v_ceil_f32 OPS32 +// v_ceil_f64 OPS64 +// v_cls_i32 OPS32 +// v_clz_i32_u32 OPS32 +// v_cos_f16 OPS16 +// v_cos_f32 OPS32 +// v_ctz_i32_b32 OPS32 +// v_cvt_f16_f32 v5.l, SRC32 +// v_cvt_f16_f32 v127.h, 0xaf123456 +// v_cvt_f16_f32 v127.l, 0.5 +// v_cvt_f16_i16 OPS16 +// v_cvt_f16_u16 OPS16 +// v_cvt_f32_f16 OPS_32_16 +// v_cvt_f32_f64 OPS_32_64 +// v_cvt_f32_i32 OPS32 +// v_cvt_f32_u32 OPS32 +// v_cvt_f32_ubyte0 OPS32 +// v_cvt_f32_ubyte1 OPS32 +// v_cvt_f32_ubyte2 OPS32 +// v_cvt_f32_ubyte3 OPS32 +// v_cvt_f64_f32 OPS_64_32 +// v_cvt_f64_i32 OPS_64_32 +// v_cvt_f64_u32 OPS_64_32 +// v_cvt_floor_i32_f32 
OPS32 +// v_cvt_flr_i32_f32 OPS32 +// v_cvt_i16_f16 OPS16 +// v_cvt_i32_f32 OPS32 +// v_cvt_i32_f64 OPS_32_64 +// v_cvt_i32_i16 OPS_32_16 +// v_cvt_nearest_i32_f32 OPS32 +// v_cvt_norm_i16_f16 OPS16 +// v_cvt_norm_u16_f16 OPS16 +// v_cvt_off_f32_i4 v5, SRC32 +// v_cvt_off_f32_i4 v255, 0x4f +// v_cvt_rpi_i32_f32 OPS32 +// v_cvt_u16_f16 OPS16 +// v_cvt_u32_f32 OPS32 +// v_cvt_u32_f64 OPS_32_64 +// v_cvt_u32_u16 OPS_32_16 +// v_exp_f16 OPS16 +// v_exp_f32 OPS32 +// v_ffbh_i32 OPS32 +// v_ffbh_u32 OPS32 +// v_ffbl_b32 OPS32 +// v_floor_f16 OPS16 +// v_floor_f32 OPS32 +// v_floor_f64 OPS64 +// v_fract_f16 OPS16 +// v_fract_f32 OPS32 +// v_fract_f64 OPS64 +// v_frexp_exp_i16_f16 OPS16 +// v_frexp_exp_i32_f32 OPS32 +// v_frexp_exp_i32_f64 OPS_32_64 +// v_frexp_mant_f16 OPS16 +// v_frexp_mant_f32 OPS32 +// v_frexp_mant_f64 OPS64 +// v_log_f16 OPS16 +// v_log_f32 OPS32 +// v_mov_b16_e32 OPS16 +// v_mov_b16_e64 OPS16 +// v_mov_b32 OPS32 +// v_movreld_b32 OPS32 +// v_movrels_b32 v5, v1 +// v_movrels_b32 v255, v255 +// v_movrelsd_2_b32 v5, v1 +// v_movrelsd_2_b32 v255, v255 +// v_movrelsd_b32 v5, v1 +// v_movrelsd_b32 v255, v255 +// v_nop +// v_not_b16 OPS16 +// v_not_b32 OPS32 +// v_permlane64_b32 v5, v1 +// v_permlane64_b32 v255, v255 +// v_pipeflush +// v_rcp_f16 OPS16 +// v_rcp_f32 OPS32 +// v_rcp_f64 OPS64 +// v_rcp_iflag_f32 OPS32 +// v_readfirstlane_b32 s5, v1 +// v_readfirstlane_b32 s105, v1 +// v_readfirstlane_b32 vcc_lo, v1 +// v_readfirstlane_b32 vcc_hi, v1 +// v_readfirstlane_b32 ttmp15, v1 +// v_readfirstlane_b32 null, v255 +// v_rndne_f16 OPS16 +// v_rndne_f32 OPS32 +// v_rndne_f64 OPS64 +// v_rsq_f16 OPS16 +// v_rsq_f32 OPS32 +// v_rsq_f64 OPS64 +// v_sat_pk_u8_i16 v5.l, SRC32 +// v_sat_pk_u8_i16 v127.l, 0xfe0b +// v_sat_pk_u8_i16 v127.l, 0.5 +// v_sat_pk_u8_i16 v5.h, src_scc +// v_sat_pk_u8_i16 v127.h, 0xfe0b +// v_sin_f16 OPS16 +// v_sin_f32 OPS32 +// v_sqrt_f16 OPS16 +// v_sqrt_f32 OPS32 +// v_sqrt_f64 OPS64 +// v_swap_b16 v5.l, v1.h +// v_swap_b16 v5.h, v1.l 
+// v_swap_b16 v127.l, v127.l +// v_swap_b32 v5, v1 +// v_swap_b32 v255, v255 +// v_swaprel_b32 v5, v1 +// v_swaprel_b32 v255, v255 +// v_trunc_f16 OPS16 +// v_trunc_f32 OPS32 +// v_trunc_f64 OPS64 +// +// SRC16= +// v1.l +// v127.l +// v1.h +// v127.h +// s1 +// s105 +// vcc_lo +// vcc_hi +// ttmp15 +// m0 +// exec_lo +// exec_hi +// null +// -1 +// 0.5 +// src_scc +// +// OPS16= +// v5.l, SRC16 +// v5.l, 0xfe0b +// v5.h, src_scc +// v127.h, 0xfe0b +// +// SRC32= +// v1 +// v255 +// s1 +// s105 +// vcc_lo +// vcc_hi +// ttmp15 +// m0 +// exec_lo +// exec_hi +// null +// -1 +// 0.5 +// src_scc +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 +// +// SRC64= +// v[1:2] +// v[254:255] +// s[2:3] +// s[104:105] +// vcc +// ttmp[14:15] +// exec +// null +// -1 +// 0.5 +// src_scc +// +// OPS64= +// v[5:6], SRC64 +// v[254:255], 0xaf123456 +// +// OPS_32_16= +// v5, SRC16 +// v255, 0xfe0b +// +// OPS_32_64= +// v5, SRC64 +// v255, 0xaf123456 +// +// OPS_64_32= +// v[5:6], SRC32 +// v[254:255], 0xaf123456 + +v_bfrev_b32 v5, v1 // GFX11: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] v_bfrev_b32 v5, v255 @@ -89,8 +276,14 @@ v_ceil_f16 v5.l, null v_ceil_f16 v5.l, -1 // GFX11: v_ceil_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb8,0x0a,0x7e] -v_ceil_f16 v127.l, 0.5 -// GFX11: v_ceil_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb8,0xfe,0x7e] +v_ceil_f16 v5.l, 0.5 +// GFX11: v_ceil_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb8,0x0a,0x7e] + +v_ceil_f16 v5.l, src_scc +// GFX11: v_ceil_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7e] + +v_ceil_f16 v5.l, 0xfe0b +// GFX11: v_ceil_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_ceil_f16 v5.h, src_scc // GFX11: v_ceil_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7f] @@ -275,6 +468,12 @@ v_cos_f16 v5.l, v1.l v_cos_f16 v5.l, v127.l // GFX11: v_cos_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.h +// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + 
+v_cos_f16 v5.l, v127.h +// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] + v_cos_f16 v5.l, s1 // GFX11: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] @@ -311,17 +510,8 @@ v_cos_f16 v5.l, 0.5 v_cos_f16 v5.l, src_scc // GFX11: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] -v_cos_f16 v127.l, 0xfe0b -// GFX11: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_cos_f16 v5.l, v1.h -// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] - -v_cos_f16 v5.l, v127.h -// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] - -v_cos_f16 v127.l, 0.5 -// GFX11: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] +v_cos_f16 v5.l, 0xfe0b +// GFX11: v_cos_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cos_f16 v5.h, src_scc // GFX11: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] @@ -458,8 +648,8 @@ v_cvt_f16_f32 v5.l, -1 v_cvt_f16_f32 v5.l, 0.5 // GFX11: v_cvt_f16_f32_e32 v5.l, 0.5 ; encoding: [0xf0,0x14,0x0a,0x7e] -v_cvt_f16_f32 v5.h, src_scc -// GFX11: v_cvt_f16_f32_e32 v5.h, src_scc ; encoding: [0xfd,0x14,0x0a,0x7f] +v_cvt_f16_f32 v5.l, src_scc +// GFX11: v_cvt_f16_f32_e32 v5.l, src_scc ; encoding: [0xfd,0x14,0x0a,0x7e] v_cvt_f16_f32 v127.h, 0xaf123456 // GFX11: v_cvt_f16_f32_e32 v127.h, 0xaf123456 ; encoding: [0xff,0x14,0xfe,0x7f,0x56,0x34,0x12,0xaf] @@ -509,12 +699,15 @@ v_cvt_f16_i16 v5.l, null v_cvt_f16_i16 v5.l, -1 // GFX11: v_cvt_f16_i16_e32 v5.l, -1 ; encoding: [0xc1,0xa2,0x0a,0x7e] -v_cvt_f16_i16 v127.l, 0.5 -// GFX11: v_cvt_f16_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa2,0xfe,0x7e] - v_cvt_f16_i16 v5.l, 0.5 // GFX11: v_cvt_f16_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa2,0x0a,0x7e] +v_cvt_f16_i16 v5.l, src_scc +// GFX11: v_cvt_f16_i16_e32 v5.l, src_scc ; encoding: [0xfd,0xa2,0x0a,0x7e] + +v_cvt_f16_i16 v5.l, 0xfe0b +// GFX11: v_cvt_f16_i16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] 
+ v_cvt_f16_i16 v5.h, src_scc // GFX11: v_cvt_f16_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xa2,0x0a,0x7f] @@ -563,11 +756,14 @@ v_cvt_f16_u16 v5.l, null v_cvt_f16_u16 v5.l, -1 // GFX11: v_cvt_f16_u16_e32 v5.l, -1 ; encoding: [0xc1,0xa0,0x0a,0x7e] -v_cvt_f16_u16 v127.l, 0.5 -// GFX11: v_cvt_f16_u16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa0,0xfe,0x7e] +v_cvt_f16_u16 v5.l, 0.5 +// GFX11: v_cvt_f16_u16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa0,0x0a,0x7e] + +v_cvt_f16_u16 v5.l, src_scc +// GFX11: v_cvt_f16_u16_e32 v5.l, src_scc ; encoding: [0xfd,0xa0,0x0a,0x7e] -v_cvt_f16_u16 v5, 0.5 -// GFX11: v_cvt_f16_u16_e32 v5, 0.5 ; encoding: [0xf0,0xa0,0x0a,0x7e] +v_cvt_f16_u16 v5.l, 0xfe0b +// GFX11: v_cvt_f16_u16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_f16_u16 v5.h, src_scc // GFX11: v_cvt_f16_u16_e32 v5.h, src_scc ; encoding: [0xfd,0xa0,0x0a,0x7f] @@ -1199,8 +1395,14 @@ v_cvt_i16_f16 v5.l, null v_cvt_i16_f16 v5.l, -1 // GFX11: v_cvt_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa6,0x0a,0x7e] -v_cvt_i16_f16 v127.l, 0.5 -// GFX11: v_cvt_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa6,0xfe,0x7e] +v_cvt_i16_f16 v5.l, 0.5 +// GFX11: v_cvt_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa6,0x0a,0x7e] + +v_cvt_i16_f16 v5.l, src_scc +// GFX11: v_cvt_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa6,0x0a,0x7e] + +v_cvt_i16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_i16_f16 v5.h, src_scc // GFX11: v_cvt_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa6,0x0a,0x7f] @@ -1295,6 +1497,12 @@ v_cvt_i32_i16 v5, v1.l v_cvt_i32_i16 v5, v127.l // GFX11: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.h +// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX11: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_i32_i16 v5, s1 // GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ 
-1334,12 +1542,6 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_cvt_i32_i16 v5, v1.h -// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] - -v_cvt_i32_i16 v5, v127.h -// GFX11: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] - v_cvt_nearest_i32_f32 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] @@ -1427,8 +1629,14 @@ v_cvt_norm_i16_f16 v5.l, null v_cvt_norm_i16_f16 v5.l, -1 // GFX11: v_cvt_norm_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc6,0x0a,0x7e] -v_cvt_norm_i16_f16 v127.l, 0.5 -// GFX11: v_cvt_norm_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc6,0xfe,0x7e] +v_cvt_norm_i16_f16 v5.l, 0.5 +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc6,0x0a,0x7e] + +v_cvt_norm_i16_f16 v5.l, src_scc +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc6,0x0a,0x7e] + +v_cvt_norm_i16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_norm_i16_f16 v5.h, src_scc // GFX11: v_cvt_norm_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc6,0x0a,0x7f] @@ -1478,8 +1686,14 @@ v_cvt_norm_u16_f16 v5.l, null v_cvt_norm_u16_f16 v5.l, -1 // GFX11: v_cvt_norm_u16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc8,0x0a,0x7e] -v_cvt_norm_u16_f16 v127.l, 0.5 -// GFX11: v_cvt_norm_u16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc8,0xfe,0x7e] +v_cvt_norm_u16_f16 v5.l, 0.5 +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc8,0x0a,0x7e] + +v_cvt_norm_u16_f16 v5.l, src_scc +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc8,0x0a,0x7e] + +v_cvt_norm_u16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_norm_u16_f16 v5.h, src_scc // GFX11: v_cvt_norm_u16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc8,0x0a,0x7f] @@ -1619,8 +1833,14 @@ 
v_cvt_u16_f16 v5.l, null v_cvt_u16_f16 v5.l, -1 // GFX11: v_cvt_u16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa4,0x0a,0x7e] -v_cvt_u16_f16 v127.l, 0.5 -// GFX11: v_cvt_u16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa4,0xfe,0x7e] +v_cvt_u16_f16 v5.l, 0.5 +// GFX11: v_cvt_u16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa4,0x0a,0x7e] + +v_cvt_u16_f16 v5.l, src_scc +// GFX11: v_cvt_u16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa4,0x0a,0x7e] + +v_cvt_u16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_u16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa4,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_u16_f16 v5.h, src_scc // GFX11: v_cvt_u16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa4,0x0a,0x7f] @@ -1715,6 +1935,12 @@ v_cvt_u32_u16 v5, v1.l v_cvt_u32_u16 v5, v127.l // GFX11: v_cvt_u32_u16_e32 v5, v127.l ; encoding: [0x7f,0xd7,0x0a,0x7e] +v_cvt_u32_u16 v5, v1.h +// GFX11: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e] + +v_cvt_u32_u16 v5, v127.h +// GFX11: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e] + v_cvt_u32_u16 v5, s1 // GFX11: v_cvt_u32_u16_e32 v5, s1 ; encoding: [0x01,0xd6,0x0a,0x7e] @@ -1754,12 +1980,6 @@ v_cvt_u32_u16 v5, src_scc v_cvt_u32_u16 v255, 0xfe0b // GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_cvt_u32_u16 v5, v1.h -// GFX11: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e] - -v_cvt_u32_u16 v5, v127.h -// GFX11: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e] - v_exp_f16 v5.l, v1.l // GFX11: v_exp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb1,0x0a,0x7e] @@ -1802,8 +2022,14 @@ v_exp_f16 v5.l, null v_exp_f16 v5.l, -1 // GFX11: v_exp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] -v_exp_f16 v127.l, 0.5 -// GFX11: v_exp_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb0,0xfe,0x7e] +v_exp_f16 v5.l, 0.5 +// GFX11: v_exp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] + +v_exp_f16 v5.l, src_scc +// GFX11: v_exp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] + +v_exp_f16 v5.l, 0xfe0b +// 
GFX11: v_exp_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_exp_f16 v5.h, src_scc // GFX11: v_exp_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7f] @@ -2033,8 +2259,14 @@ v_floor_f16 v5.l, null v_floor_f16 v5.l, -1 // GFX11: v_floor_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] -v_floor_f16 v127.l, 0.5 -// GFX11: v_floor_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb6,0xfe,0x7e] +v_floor_f16 v5.l, 0.5 +// GFX11: v_floor_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] + +v_floor_f16 v5.l, src_scc +// GFX11: v_floor_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] + +v_floor_f16 v5.l, 0xfe0b +// GFX11: v_floor_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_floor_f16 v5.h, src_scc // GFX11: v_floor_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7f] @@ -2129,6 +2361,12 @@ v_fract_f16 v5.l, v1.l v_fract_f16 v5.l, v127.l // GFX11: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.h +// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + v_fract_f16 v5.l, s1 // GFX11: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] @@ -2165,17 +2403,8 @@ v_fract_f16 v5.l, 0.5 v_fract_f16 v5.l, src_scc // GFX11: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127.l, 0xfe0b -// GFX11: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_fract_f16 v5.l, v1.h -// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] - -v_fract_f16 v5.l, v127.h -// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] - -v_fract_f16 v127.l, 0.5 -// GFX11: v_fract_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xbe,0xfe,0x7e] +v_fract_f16 v5.l, 0xfe0b +// GFX11: v_fract_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xbe,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_fract_f16 v5.h, src_scc // GFX11: 
v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] @@ -2306,8 +2535,14 @@ v_frexp_exp_i16_f16 v5.l, null v_frexp_exp_i16_f16 v5.l, -1 // GFX11: v_frexp_exp_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb4,0x0a,0x7e] -v_frexp_exp_i16_f16 v127.l, 0.5 -// GFX11: v_frexp_exp_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb4,0xfe,0x7e] +v_frexp_exp_i16_f16 v5.l, 0.5 +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb4,0x0a,0x7e] + +v_frexp_exp_i16_f16 v5.l, src_scc +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb4,0x0a,0x7e] + +v_frexp_exp_i16_f16 v5.l, 0xfe0b +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb4,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_frexp_exp_i16_f16 v5.h, src_scc // GFX11: v_frexp_exp_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb4,0x0a,0x7f] @@ -2402,6 +2637,12 @@ v_frexp_mant_f16 v5.l, v1.l v_frexp_mant_f16 v5.l, v127.l // GFX11: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + v_frexp_mant_f16 v5.l, s1 // GFX11: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] @@ -2438,17 +2679,8 @@ v_frexp_mant_f16 v5.l, 0.5 v_frexp_mant_f16 v5.l, src_scc // GFX11: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127.l, 0xfe0b -// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_frexp_mant_f16 v5.l, v1.h -// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] - -v_frexp_mant_f16 v5.l, v127.h -// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] - -v_frexp_mant_f16 v127.l, 0.5 -// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] +v_frexp_mant_f16 v5.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v5.l, 
0xfe0b ; encoding: [0xff,0xb2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_frexp_mant_f16 v5.h, src_scc // GFX11: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] @@ -2579,8 +2811,14 @@ v_log_f16 v5.l, null v_log_f16 v5.l, -1 // GFX11: v_log_f16_e32 v5.l, -1 ; encoding: [0xc1,0xae,0x0a,0x7e] -v_log_f16 v127.l, 0.5 -// GFX11: v_log_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xae,0xfe,0x7e] +v_log_f16 v5.l, 0.5 +// GFX11: v_log_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xae,0x0a,0x7e] + +v_log_f16 v5.l, src_scc +// GFX11: v_log_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xae,0x0a,0x7e] + +v_log_f16 v5.l, 0xfe0b +// GFX11: v_log_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xae,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_log_f16 v5.h, src_scc // GFX11: v_log_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xae,0x0a,0x7f] @@ -2633,35 +2871,119 @@ v_log_f32 v5, src_scc v_log_f32 v255, 0xaf123456 // GFX11: v_log_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_mov_b16_e32 v0.l, v1.l -// GFX11: v_mov_b16_e32 v0.l, v1.l ; encoding: [0x01,0x39,0x00,0x7e] +v_mov_b16_e32 v5.l, v1.l +// GFX11: v_mov_b16_e32 v5.l, v1.l ; encoding: [0x01,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v127.l +// GFX11: v_mov_b16_e32 v5.l, v127.l ; encoding: [0x7f,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v1.h +// GFX11: v_mov_b16_e32 v5.l, v1.h ; encoding: [0x81,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v127.h +// GFX11: v_mov_b16_e32 v5.l, v127.h ; encoding: [0xff,0x39,0x0a,0x7e] -v_mov_b16_e32 v0.l, s1 -// GFX11: v_mov_b16_e32 v0.l, s1 ; encoding: [0x01,0x38,0x00,0x7e] +v_mov_b16_e32 v5.l, s1 +// GFX11: v_mov_b16_e32 v5.l, s1 ; encoding: [0x01,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.h, 0 -// GFX11: v_mov_b16_e32 v0.h, 0 ; encoding: [0x80,0x38,0x00,0x7f] +v_mov_b16_e32 v5.l, s105 +// GFX11: v_mov_b16_e32 v5.l, s105 ; encoding: [0x69,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.h, 1.0 -// GFX11: v_mov_b16_e32 v0.h, 1.0 ; encoding: [0xf2,0x38,0x00,0x7f] +v_mov_b16_e32 v5.l, vcc_lo +// GFX11: v_mov_b16_e32 v5.l, vcc_lo ; 
encoding: [0x6a,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.l, 0x1234 -// GFX11: v_mov_b16_e32 v0.l, 0x1234 ; encoding: [0xff,0x38,0x00,0x7e,0x34,0x12,0x00,0x00] +v_mov_b16_e32 v5.l, vcc_hi +// GFX11: v_mov_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, v1.l -// GFX11: v_mov_b16_e64 v0.l, v1.l ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] +v_mov_b16_e32 v5.l, ttmp15 +// GFX11: v_mov_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x38,0x0a,0x7e] -v_mov_b16_e64 v200.l, v1.h -// GFX11: v_mov_b16_e64 v200.l, v1.h op_sel:[1,0] ; encoding: [0xc8,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] +v_mov_b16_e32 v5.l, m0 +// GFX11: v_mov_b16_e32 v5.l, m0 ; encoding: [0x7d,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, s1 -// GFX11: v_mov_b16_e64 v0.l, s1 ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] +v_mov_b16_e32 v5.l, exec_lo +// GFX11: v_mov_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0x38,0x0a,0x7e] -v_mov_b16_e64 v200.h, 1 -// GFX11: v_mov_b16_e64 v200.h, 1 op_sel:[0,1] ; encoding: [0xc8,0x40,0x9c,0xd5,0x81,0x00,0x00,0x00] +v_mov_b16_e32 v5.l, exec_hi +// GFX11: v_mov_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, 0x1234 -// GFX11: v_mov_b16_e64 v0.l, 0x1234 ; encoding: [0x00,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +v_mov_b16_e32 v5.l, null +// GFX11: v_mov_b16_e32 v5.l, null ; encoding: [0x7c,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, -1 +// GFX11: v_mov_b16_e32 v5.l, -1 ; encoding: [0xc1,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, 0.5 +// GFX11: v_mov_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, src_scc +// GFX11: v_mov_b16_e32 v5.l, src_scc ; encoding: [0xfd,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, 0xfe0b +// GFX11: v_mov_b16_e32 v5.l, 0xfe0b ; encoding: [0xff,0x38,0x0a,0x7e,0x0b,0xfe,0x00,0x00] + +v_mov_b16_e32 v5.h, src_scc +// GFX11: v_mov_b16_e32 v5.h, src_scc ; encoding: [0xfd,0x38,0x0a,0x7f] + +v_mov_b16_e32 v127.h, 0xfe0b +// GFX11: v_mov_b16_e32 v127.h, 0xfe0b ; encoding: 
[0xff,0x38,0xfe,0x7f,0x0b,0xfe,0x00,0x00] + +v_mov_b16_e64 v5.l, v1.l +// GFX11: v_mov_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v127.l +// GFX11: v_mov_b16_e64 v5.l, v127.l ; encoding: [0x05,0x00,0x9c,0xd5,0x7f,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v1.h +// GFX11: v_mov_b16_e64 v5.l, v1.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v127.h +// GFX11: v_mov_b16_e64 v5.l, v127.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9c,0xd5,0x7f,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, s1 +// GFX11: v_mov_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, s105 +// GFX11: v_mov_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0x9c,0xd5,0x69,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, vcc_lo +// GFX11: v_mov_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x9c,0xd5,0x6a,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, vcc_hi +// GFX11: v_mov_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x9c,0xd5,0x6b,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, ttmp15 +// GFX11: v_mov_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x9c,0xd5,0x7b,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, m0 +// GFX11: v_mov_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0x9c,0xd5,0x7d,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, exec_lo +// GFX11: v_mov_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x9c,0xd5,0x7e,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, exec_hi +// GFX11: v_mov_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x9c,0xd5,0x7f,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, null +// GFX11: v_mov_b16_e64 v5.l, null ; encoding: [0x05,0x00,0x9c,0xd5,0x7c,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, -1 +// GFX11: v_mov_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0x9c,0xd5,0xc1,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, 0.5 +// GFX11: v_mov_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0x9c,0xd5,0xf0,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, src_scc +// GFX11: v_mov_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0x9c,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b16_e64 
v5.l, 0xfe0b +// GFX11: v_mov_b16_e64 v5.l, 0xfe0b ; encoding: [0x05,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_mov_b16_e64 v5.h, src_scc +// GFX11: v_mov_b16_e64 v5.h, src_scc op_sel:[0,1] ; encoding: [0x05,0x40,0x9c,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b16_e64 v127.h, 0xfe0b +// GFX11: v_mov_b16_e64 v127.h, 0xfe0b op_sel:[0,1] ; encoding: [0x7f,0x40,0x9c,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_mov_b32 v5, v1 // GFX11: v_mov_b32_e32 v5, v1 ; encoding: [0x01,0x03,0x0a,0x7e] @@ -2780,6 +3102,12 @@ v_not_b16 v5.l, v1.l v_not_b16 v5.l, v127.l // GFX11: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.h +// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + v_not_b16 v5.l, s1 // GFX11: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] @@ -2816,17 +3144,8 @@ v_not_b16 v5.l, 0.5 v_not_b16 v5.l, src_scc // GFX11: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127.l, 0xfe0b -// GFX11: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_not_b16 v5.l, v1.h -// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] - -v_not_b16 v5.l, v127.h -// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] - -v_not_b16 v127.l, 0.5 -// GFX11: v_not_b16_e32 v127.l, 0.5 ; encoding: [0xf0,0xd2,0xfe,0x7e] +v_not_b16 v5.l, 0xfe0b +// GFX11: v_not_b16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xd2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_not_b16 v5.h, src_scc // GFX11: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] @@ -2930,8 +3249,14 @@ v_rcp_f16 v5.l, null v_rcp_f16 v5.l, -1 // GFX11: v_rcp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa8,0x0a,0x7e] -v_rcp_f16 v127.l, 0.5 -// GFX11: v_rcp_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa8,0xfe,0x7e] +v_rcp_f16 v5.l, 0.5 +// GFX11: v_rcp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa8,0x0a,0x7e] + 
+v_rcp_f16 v5.l, src_scc +// GFX11: v_rcp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7e] + +v_rcp_f16 v5.l, 0xfe0b +// GFX11: v_rcp_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rcp_f16 v5.h, src_scc // GFX11: v_rcp_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7f] @@ -3089,6 +3414,12 @@ v_rndne_f16 v5.l, v1.l v_rndne_f16 v5.l, v127.l // GFX11: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e] +v_rndne_f16 v5.l, v1.h +// GFX11: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e] + +v_rndne_f16 v5.l, v127.h +// GFX11: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e] + v_rndne_f16 v5.l, s1 // GFX11: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e] @@ -3125,17 +3456,8 @@ v_rndne_f16 v5.l, 0.5 v_rndne_f16 v5.l, src_scc // GFX11: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e] -v_rndne_f16 v127.l, 0xfe0b -// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_rndne_f16 v5.l, v1.h -// GFX11: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e] - -v_rndne_f16 v5.l, v127.h -// GFX11: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e] - -v_rndne_f16 v127.l, 0.5 -// GFX11: v_rndne_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xbc,0xfe,0x7e] +v_rndne_f16 v5.l, 0xfe0b +// GFX11: v_rndne_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xbc,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rndne_f16 v5.h, src_scc // GFX11: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f] @@ -3266,8 +3588,14 @@ v_rsq_f16 v5.l, null v_rsq_f16 v5.l, -1 // GFX11: v_rsq_f16_e32 v5.l, -1 ; encoding: [0xc1,0xac,0x0a,0x7e] -v_rsq_f16 v127.l, 0.5 -// GFX11: v_rsq_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xac,0xfe,0x7e] +v_rsq_f16 v5.l, 0.5 +// GFX11: v_rsq_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xac,0x0a,0x7e] + +v_rsq_f16 v5.l, src_scc +// GFX11: v_rsq_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xac,0x0a,0x7e] + +v_rsq_f16 v5.l, 0xfe0b +// GFX11: v_rsq_f16_e32 
v5.l, 0xfe0b ; encoding: [0xff,0xac,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rsq_f16 v5.h, src_scc // GFX11: v_rsq_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xac,0x0a,0x7f] @@ -3416,6 +3744,12 @@ v_sin_f16 v5.l, v1.l v_sin_f16 v5.l, v127.l // GFX11: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.h +// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + v_sin_f16 v5.l, s1 // GFX11: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] @@ -3452,17 +3786,8 @@ v_sin_f16 v5.l, 0.5 v_sin_f16 v5.l, src_scc // GFX11: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 v127.l, 0xfe0b -// GFX11: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_sin_f16 v5.l, v1.h -// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] - -v_sin_f16 v5.l, v127.h -// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] - -v_sin_f16 v127.l, 0.5 -// GFX11: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] +v_sin_f16 v5.l, 0xfe0b +// GFX11: v_sin_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_sin_f16 v5.h, src_scc // GFX11: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] @@ -3557,8 +3882,14 @@ v_sqrt_f16 v5.l, null v_sqrt_f16 v5.l, -1 // GFX11: v_sqrt_f16_e32 v5.l, -1 ; encoding: [0xc1,0xaa,0x0a,0x7e] -v_sqrt_f16 v127.l, 0.5 -// GFX11: v_sqrt_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xaa,0xfe,0x7e] +v_sqrt_f16 v5.l, 0.5 +// GFX11: v_sqrt_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xaa,0x0a,0x7e] + +v_sqrt_f16 v5.l, src_scc +// GFX11: v_sqrt_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7e] + +v_sqrt_f16 v5.l, 0xfe0b +// GFX11: v_sqrt_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xaa,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_sqrt_f16 v5.h, src_scc // GFX11: v_sqrt_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7f] @@ -3674,6 +4005,12 
@@ v_trunc_f16 v5.l, v1.l v_trunc_f16 v5.l, v127.l // GFX11: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.h +// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + v_trunc_f16 v5.l, s1 // GFX11: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] @@ -3710,17 +4047,8 @@ v_trunc_f16 v5.l, 0.5 v_trunc_f16 v5.l, src_scc // GFX11: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127.l, 0xfe0b -// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_trunc_f16 v5.l, v1.h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] - -v_trunc_f16 v5.l, v127.h -// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] - -v_trunc_f16 v127.l, 0.5 -// GFX11: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] +v_trunc_f16 v5.l, 0xfe0b +// GFX11: v_trunc_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xba,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_trunc_f16 v5.h, src_scc // GFX11: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] @@ -3808,9 +4136,3 @@ v_trunc_f64 v[5:6], src_scc v_trunc_f64 v[254:255], 0xaf123456 // GFX11: v_trunc_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf] - -v_trunc_f16 v[5].l, v[1].h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] - -v_trunc_f16 v[5:5].l, v[1:1].h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s new file mode 100644 index 0000000000000..d7afebe6e5e55 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s @@ -0,0 +1,17 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +// INSTS= +// v_ceil_f32 OPS32 +// v_cos_f32 OPS32 +// +// SRC32= +// v1 # A comment. +// 0.5 +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 + +v_bfrev_b32 v5, v1 +// GFX11: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected new file mode 100644 index 0000000000000..21ee43a8a06a3 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected @@ -0,0 +1,32 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +// INSTS= +// v_ceil_f32 OPS32 +// v_cos_f32 OPS32 +// +// SRC32= +// v1 # A comment. 
+// 0.5 +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 + +v_ceil_f32 v5, v1 +// GFX11: v_ceil_f32_e32 v5, v1 ; encoding: [0x01,0x45,0x0a,0x7e] + +v_ceil_f32 v5, 0.5 +// GFX11: v_ceil_f32_e32 v5, 0.5 ; encoding: [0xf0,0x44,0x0a,0x7e] + +v_ceil_f32 v255, 0xaf123456 +// GFX11: v_ceil_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf] + +v_cos_f32 v5, v1 +// GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] + +v_cos_f32 v5, 0.5 +// GFX11: v_cos_f32_e32 v5, 0.5 ; encoding: [0xf0,0x6c,0x0a,0x7e] + +v_cos_f32 v255, 0xaf123456 +// GFX11: v_cos_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test new file mode 100644 index 0000000000000..6dfdb985d8cdb --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test @@ -0,0 +1,5 @@ +# REQUIRES: amdgpu-registered-target +## Test expanding instruction templates. 
+ +# RUN: cp -f %S/Inputs/amdgpu-templates.s %t.s && %update_mc_test_checks %t.s +# RUN: diff -u %S/Inputs//amdgpu-templates.s.expected %t.s diff --git a/llvm/utils/update_mc_test_checks.py b/llvm/utils/update_mc_test_checks.py index 363278d1b1f97..9b80267e8ad8c 100755 --- a/llvm/utils/update_mc_test_checks.py +++ b/llvm/utils/update_mc_test_checks.py @@ -29,6 +29,11 @@ ] +class Error(Exception): + def __init__(self, test_info, line_no, msg): + super().__init__(f"{test_info.path}:{line_no}: {msg}") + + def invoke_tool(exe, check_rc, cmd_args, testline, verbose=False): substs = SUBSTITUTIONS + [(t, exe) for t in mc_LIKE_TOOLS] args = [common.applySubstitutions(cmd, substs) for cmd in cmd_args.split("|")] @@ -125,6 +130,62 @@ def getErrCheckLine(prefix, output, mc_mode, line_offset=1): ) +def parse_token_defs(test_info): + tokens = {} + current_token = None + for line_no, line in enumerate(test_info.input_lines, start=1): + # Remove comments. + line = line.split("#")[0].rstrip() + + # Skip everything up to the instructions definition. + if not tokens and not current_token and line != "// INSTS=": + continue + + if not line.startswith("//"): + break + + original_len = len(line) + line = line[2:].lstrip(" ") + indent = original_len - len(line) + + if not line: + current_token = None + continue + + # Define a new token. + if not current_token: + if indent != 4 or not line.endswith("="): + raise Error(test_info, line_no, "token definition expected") + + current_token = line[:-1].strip() + if current_token in tokens: + raise Error(test_info, line_no, f"'{current_token}' redefined") + + tokens[current_token] = [] + continue + + # Add token value. 
+ if indent != 8: + raise Error(test_info, line_no, "wrong indentation for token value") + + tokens[current_token].append(line) + + return tokens + + +def expand_insts(tokens): + def subst(s): + for token, values in tokens.items(): + if token in s: + for value in values: + yield from subst(s.replace(token, value, 1)) + return + + yield s + + yield from subst("INSTS") + + def update_test(ti: common.TestInfo): if ti.path.endswith(".s"): mc_mode = "asm" @@ -209,6 +270,14 @@ def update_test(ti: common.TestInfo): testlines = list(dict.fromkeys(testlines)) common.debug("Valid test line found: ", len(testlines)) + # Where instruction templates are specified, use them instead. + use_asm_templates = False + if mc_mode == "asm": + tokens = parse_token_defs(ti) + if "INSTS" in tokens: + testlines = list(expand_insts(tokens)) + use_asm_templates = True + raw_output = [] raw_prefixes = [] for ( @@ -244,7 +313,6 @@ def update_test(ti: common.TestInfo): raw_prefixes.append(prefixes) - output_lines = [] generated_prefixes = {} sort_keys = {} used_prefixes = set() @@ -321,14 +389,32 @@ def update_test(ti: common.TestInfo): generated_prefixes[input_line] = "\n".join(check_lines) # write output - for input_info in ti.iterlines(output_lines): - input_line = input_info.line - if input_line in testlines: - output_lines.append(input_line) - output_lines.append(generated_prefixes[input_line]) - - elif should_add_line_to_output(input_line, prefix_set, mc_mode): - output_lines.append(input_line) + output_lines = [] + if use_asm_templates: + # Keep all leading comments and empty lines. + for input_info in ti.iterlines(output_lines): + input_line = input_info.line + if not input_line or input_line.startswith(COMMENT[mc_mode]): + output_lines.append(input_line) + continue + break + + # Remove tail empty lines. + while not output_lines[-1]: + del output_lines[-1] + + # Emit test and check lines. 
+ for input_line in testlines: + output_lines.extend(["", input_line, generated_prefixes[input_line]]) + else: + for input_info in ti.iterlines(output_lines): + input_line = input_info.line + if input_line in testlines: + output_lines.append(input_line) + output_lines.append(generated_prefixes[input_line]) + + elif should_add_line_to_output(input_line, prefix_set, mc_mode): + output_lines.append(input_line) if ti.args.unique or ti.args.sort: # split with double newlines From d5927a6172ab9b95f7f533bfdff865c1ce2aad5b Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Mon, 24 Nov 2025 19:08:53 +0500 Subject: [PATCH 10/19] [LLDB] Add unary plus and minus to DIL (#155617) This patch adds unary nodes plus and minus, introduces unary type conversions, and adds integral promotion to the type system. --- lldb/docs/dil-expr-lang.ebnf | 2 +- lldb/include/lldb/Symbol/TypeSystem.h | 12 ++ lldb/include/lldb/ValueObject/DILAST.h | 2 + lldb/include/lldb/ValueObject/DILEval.h | 4 + .../TypeSystem/Clang/TypeSystemClang.cpp | 96 +++++++++ .../TypeSystem/Clang/TypeSystemClang.h | 8 + lldb/source/Symbol/CompilerType.cpp | 28 +-- lldb/source/Symbol/TypeSystem.cpp | 11 + lldb/source/ValueObject/DILEval.cpp | 196 +++++++++++++++--- lldb/source/ValueObject/DILParser.cpp | 12 +- .../frame/var-dil/expr/Arithmetic/Makefile | 3 + .../Arithmetic/TestFrameVarDILArithmetic.py | 46 ++++ .../frame/var-dil/expr/Arithmetic/main.cpp | 23 ++ .../var-dil/expr/PointerArithmetic/Makefile | 3 + .../TestFrameVarDILPointerArithmetic.py | 29 +++ .../var-dil/expr/PointerArithmetic/main.cpp | 11 + 16 files changed, 427 insertions(+), 59 deletions(-) create mode 100644 lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile create mode 100644 lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py create mode 100644 lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp create mode 100644 lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile create mode 100644 
lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py create mode 100644 lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp diff --git a/lldb/docs/dil-expr-lang.ebnf b/lldb/docs/dil-expr-lang.ebnf index 70eda3bf40650..ccd2b00223910 100644 --- a/lldb/docs/dil-expr-lang.ebnf +++ b/lldb/docs/dil-expr-lang.ebnf @@ -8,7 +8,7 @@ expression = unary_expression ; unary_expression = postfix_expression | unary_operator expression ; -unary_operator = "*" | "&" ; +unary_operator = "*" | "&" | "+" | "-"; postfix_expression = primary_expression | postfix_expression "[" integer_literal "]" diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 25b208a65349b..99ea0585e5370 100644 --- a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -411,6 +411,18 @@ class TypeSystem : public PluginInterface, GetIntegralTemplateArgument(lldb::opaque_compiler_type_t type, size_t idx, bool expand_pack); + // DIL + + /// Checks if the type is eligible for integral promotion. + virtual bool IsPromotableIntegerType(lldb::opaque_compiler_type_t type); + + /// Perform integral promotion on a given type. + /// This promotes eligible types (boolean, integers, unscoped enumerations) + /// to a larger integer type according to type system rules. + /// \returns Promoted type. + virtual llvm::Expected + DoIntegralPromotion(CompilerType from, ExecutionContextScope *exe_scope); + // Dumping types #ifndef NDEBUG diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h index 0f05d753f1b56..91f8d93c09622 100644 --- a/lldb/include/lldb/ValueObject/DILAST.h +++ b/lldb/include/lldb/ValueObject/DILAST.h @@ -33,6 +33,8 @@ enum class NodeKind { enum class UnaryOpKind { AddrOf, // "&" Deref, // "*" + Minus, // "-" + Plus, // "+" }; /// Forward declaration, for use in DIL AST nodes. 
Definition is at the very diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h index eab3218ff828f..a65edc58cc4e7 100644 --- a/lldb/include/lldb/ValueObject/DILEval.h +++ b/lldb/include/lldb/ValueObject/DILEval.h @@ -61,6 +61,10 @@ class Interpreter : Visitor { llvm::Expected Visit(const BooleanLiteralNode *node) override; + /// Perform usual unary conversions on a value. At the moment this + /// includes array-to-pointer and integral promotion for eligible types. + llvm::Expected + UnaryConversion(lldb::ValueObjectSP valobj, uint32_t location); llvm::Expected PickIntegerType(lldb::TypeSystemSP type_system, std::shared_ptr ctx, diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 51cb883748514..aa8d309fbc730 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -7346,6 +7346,102 @@ CompilerType TypeSystemClang::GetTypeForFormatters(void *type) { return CompilerType(); } +bool TypeSystemClang::IsPromotableIntegerType( + lldb::opaque_compiler_type_t type) { + // Unscoped enums are always considered as promotable, even if their + // underlying type does not need to be promoted (e.g. "int"). 
+ bool is_signed = false; + bool isUnscopedEnumerationType = + IsEnumerationType(type, is_signed) && !IsScopedEnumerationType(type); + if (isUnscopedEnumerationType) + return true; + + switch (GetBasicTypeEnumeration(type)) { + case lldb::eBasicTypeBool: + case lldb::eBasicTypeChar: + case lldb::eBasicTypeSignedChar: + case lldb::eBasicTypeUnsignedChar: + case lldb::eBasicTypeShort: + case lldb::eBasicTypeUnsignedShort: + case lldb::eBasicTypeWChar: + case lldb::eBasicTypeSignedWChar: + case lldb::eBasicTypeUnsignedWChar: + case lldb::eBasicTypeChar16: + case lldb::eBasicTypeChar32: + return true; + + default: + return false; + } + + llvm_unreachable("All cases handled above."); +} + +llvm::Expected +TypeSystemClang::DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) { + if (!from.IsInteger() && !from.IsUnscopedEnumerationType()) + return from; + + if (!from.IsPromotableIntegerType()) + return from; + + if (from.IsUnscopedEnumerationType()) { + EnumDecl *enum_decl = GetAsEnumDecl(from); + CompilerType promotion_type = GetType(enum_decl->getPromotionType()); + return DoIntegralPromotion(promotion_type, exe_scope); + } + + lldb::BasicType builtin_type = + from.GetCanonicalType().GetBasicTypeEnumeration(); + uint64_t from_size = 0; + if (builtin_type == lldb::eBasicTypeWChar || + builtin_type == lldb::eBasicTypeSignedWChar || + builtin_type == lldb::eBasicTypeUnsignedWChar || + builtin_type == lldb::eBasicTypeChar16 || + builtin_type == lldb::eBasicTypeChar32) { + // Find the type that can hold the entire range of values for our type. 
+ bool is_signed = from.IsSigned(); + llvm::Expected from_size = from.GetByteSize(exe_scope); + if (!from_size) + return from_size.takeError(); + CompilerType promote_types[] = { + GetBasicTypeFromAST(lldb::eBasicTypeInt), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedInt), + GetBasicTypeFromAST(lldb::eBasicTypeLong), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedLong), + GetBasicTypeFromAST(lldb::eBasicTypeLongLong), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedLongLong), + }; + for (CompilerType &type : promote_types) { + llvm::Expected byte_size = type.GetByteSize(exe_scope); + if (!byte_size) + return byte_size.takeError(); + if (*from_size < *byte_size || + (*from_size == *byte_size && is_signed == type.IsSigned())) { + return type; + } + } + llvm_unreachable("char type should fit into long long"); + } + + // Here we can promote only to "int" or "unsigned int". + CompilerType int_type = GetBasicTypeFromAST(lldb::eBasicTypeInt); + llvm::Expected int_byte_size = int_type.GetByteSize(exe_scope); + if (!int_byte_size) + return int_byte_size.takeError(); + + // Signed integer types can be safely promoted to "int". + if (from.IsSigned()) { + return int_type; + } + // Unsigned integer types are promoted to "unsigned int" if "int" cannot hold + // their entire value range. + return (from_size == *int_byte_size) + ? 
GetBasicTypeFromAST(lldb::eBasicTypeUnsignedInt) + : int_type; +} + clang::EnumDecl *TypeSystemClang::GetAsEnumDecl(const CompilerType &type) { const clang::EnumType *enutype = llvm::dyn_cast(ClangUtil::GetCanonicalQualType(type)); diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 375891b3cfd2f..67d206e4d2df2 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -938,6 +938,14 @@ class TypeSystemClang : public TypeSystem { CompilerType GetTypeForFormatters(void *type) override; + // DIL + + bool IsPromotableIntegerType(lldb::opaque_compiler_type_t type) override; + + llvm::Expected + DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) override; + #define LLDB_INVALID_DECL_LEVEL UINT32_MAX // LLDB_INVALID_DECL_LEVEL is returned by CountDeclLevels if child_decl_ctx // could not be found in decl_ctx. diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index c999ab256fc98..1a39ea9476390 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -370,30 +370,10 @@ bool CompilerType::IsScalarOrUnscopedEnumerationType() const { } bool CompilerType::IsPromotableIntegerType() const { - // Unscoped enums are always considered as promotable, even if their - // underlying type does not need to be promoted (e.g. "int"). 
- if (IsUnscopedEnumerationType()) - return true; - - switch (GetBasicTypeEnumeration()) { - case lldb::eBasicTypeBool: - case lldb::eBasicTypeChar: - case lldb::eBasicTypeSignedChar: - case lldb::eBasicTypeUnsignedChar: - case lldb::eBasicTypeShort: - case lldb::eBasicTypeUnsignedShort: - case lldb::eBasicTypeWChar: - case lldb::eBasicTypeSignedWChar: - case lldb::eBasicTypeUnsignedWChar: - case lldb::eBasicTypeChar16: - case lldb::eBasicTypeChar32: - return true; - - default: - return false; - } - - llvm_unreachable("All cases handled above."); + if (IsValid()) + if (auto type_system_sp = GetTypeSystem()) + return type_system_sp->IsPromotableIntegerType(m_type); + return false; } bool CompilerType::IsPointerToVoid() const { diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index f7d634ffa2dec..8712142893835 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -123,6 +123,17 @@ CompilerType TypeSystem::GetTypeForFormatters(void *type) { return CompilerType(weak_from_this(), type); } +bool TypeSystem::IsPromotableIntegerType(lldb::opaque_compiler_type_t type) { + return false; +} + +llvm::Expected +TypeSystem::DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) { + return llvm::createStringError( + "Integral promotion is not implemented for this TypeSystem"); +} + bool TypeSystem::IsTemplateType(lldb::opaque_compiler_type_t type) { return false; } diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp index a9dbfad298d05..40a05a467f883 100644 --- a/lldb/source/ValueObject/DILEval.cpp +++ b/lldb/source/ValueObject/DILEval.cpp @@ -21,6 +21,101 @@ namespace lldb_private::dil { +static llvm::Expected +GetTypeSystemFromCU(std::shared_ptr ctx) { + auto stack_frame = ctx->CalculateStackFrame(); + if (!stack_frame) + return llvm::createStringError("no stack frame in this context"); + SymbolContext symbol_context = + 
stack_frame->GetSymbolContext(lldb::eSymbolContextCompUnit); + lldb::LanguageType language = symbol_context.comp_unit->GetLanguage(); + + symbol_context = stack_frame->GetSymbolContext(lldb::eSymbolContextModule); + return symbol_context.module_sp->GetTypeSystemForLanguage(language); +} + +static CompilerType GetBasicType(lldb::TypeSystemSP type_system, + lldb::BasicType basic_type) { + if (type_system) + return type_system.get()->GetBasicTypeFromAST(basic_type); + + return CompilerType(); +} + +static lldb::ValueObjectSP +ArrayToPointerConversion(ValueObject &valobj, ExecutionContextScope &ctx) { + uint64_t addr = valobj.GetLoadAddress(); + ExecutionContext exe_ctx; + ctx.CalculateExecutionContext(exe_ctx); + return ValueObject::CreateValueObjectFromAddress( + "result", addr, exe_ctx, + valobj.GetCompilerType().GetArrayElementType(&ctx).GetPointerType(), + /* do_deref */ false); +} + +llvm::Expected +Interpreter::UnaryConversion(lldb::ValueObjectSP valobj, uint32_t location) { + if (!valobj) + return llvm::make_error(m_expr, "invalid value object", + location); + llvm::Expected type_system = + GetTypeSystemFromCU(m_exe_ctx_scope); + if (!type_system) + return type_system.takeError(); + + CompilerType in_type = valobj->GetCompilerType(); + if (valobj->IsBitfield()) { + // Promote bitfields. If `int` can represent the bitfield value, it is + // converted to `int`. Otherwise, if `unsigned int` can represent it, it + // is converted to `unsigned int`. Otherwise, it is treated as its + // underlying type. + uint32_t bitfield_size = valobj->GetBitfieldBitSize(); + // Some bitfields have undefined size (e.g. result of ternary operation). + // The AST's `bitfield_size` of those is 0, and no promotion takes place. 
+ if (bitfield_size > 0 && in_type.IsInteger()) { + CompilerType int_type = GetBasicType(*type_system, lldb::eBasicTypeInt); + CompilerType uint_type = + GetBasicType(*type_system, lldb::eBasicTypeUnsignedInt); + llvm::Expected int_bit_size = + int_type.GetBitSize(m_exe_ctx_scope.get()); + if (!int_bit_size) + return int_bit_size.takeError(); + llvm::Expected uint_bit_size = + uint_type.GetBitSize(m_exe_ctx_scope.get()); + if (!uint_bit_size) + return int_bit_size.takeError(); + if (bitfield_size < *int_bit_size || + (in_type.IsSigned() && bitfield_size == *int_bit_size)) + return valobj->CastToBasicType(int_type); + if (bitfield_size <= *uint_bit_size) + return valobj->CastToBasicType(uint_type); + // Re-create as a const value with the same underlying type + Scalar scalar; + bool resolved = valobj->ResolveValue(scalar); + if (!resolved) + return llvm::createStringError("invalid scalar value"); + return ValueObject::CreateValueObjectFromScalar(m_target, scalar, in_type, + "result"); + } + } + + if (in_type.IsArrayType()) + valobj = ArrayToPointerConversion(*valobj, *m_exe_ctx_scope); + + if (valobj->GetCompilerType().IsInteger() || + valobj->GetCompilerType().IsUnscopedEnumerationType()) { + llvm::Expected promoted_type = + type_system.get()->DoIntegralPromotion(valobj->GetCompilerType(), + m_exe_ctx_scope.get()); + if (!promoted_type) + return promoted_type.takeError(); + if (!promoted_type->CompareTypes(valobj->GetCompilerType())) + return valobj->CastToBasicType(*promoted_type); + } + + return valobj; +} + static lldb::VariableSP DILFindVariable(ConstString name, VariableList &variable_list) { lldb::VariableSP exact_match; @@ -147,6 +242,10 @@ Interpreter::Interpreter(lldb::TargetSP target, llvm::StringRef expr, llvm::Expected Interpreter::Evaluate(const ASTNode *node) { // Evaluate an AST. auto value_or_error = node->Accept(this); + // Convert SP with a nullptr to an error. 
+ if (value_or_error && !*value_or_error) + return llvm::make_error(m_expr, "invalid value object", + node->GetLocation()); // Return the computed value-or-error. The caller is responsible for // checking if an error occured during the evaluation. return value_or_error; @@ -175,21 +274,21 @@ Interpreter::Visit(const IdentifierNode *node) { llvm::Expected Interpreter::Visit(const UnaryOpNode *node) { Status error; - auto rhs_or_err = Evaluate(node->GetOperand()); - if (!rhs_or_err) - return rhs_or_err; + auto op_or_err = Evaluate(node->GetOperand()); + if (!op_or_err) + return op_or_err; - lldb::ValueObjectSP rhs = *rhs_or_err; + lldb::ValueObjectSP operand = *op_or_err; switch (node->GetKind()) { case UnaryOpKind::Deref: { - lldb::ValueObjectSP dynamic_rhs = rhs->GetDynamicValue(m_use_dynamic); - if (dynamic_rhs) - rhs = dynamic_rhs; + lldb::ValueObjectSP dynamic_op = operand->GetDynamicValue(m_use_dynamic); + if (dynamic_op) + operand = dynamic_op; - lldb::ValueObjectSP child_sp = rhs->Dereference(error); + lldb::ValueObjectSP child_sp = operand->Dereference(error); if (!child_sp && m_use_synthetic) { - if (lldb::ValueObjectSP synth_obj_sp = rhs->GetSyntheticValue()) { + if (lldb::ValueObjectSP synth_obj_sp = operand->GetSyntheticValue()) { error.Clear(); child_sp = synth_obj_sp->Dereference(error); } @@ -202,18 +301,69 @@ Interpreter::Visit(const UnaryOpNode *node) { } case UnaryOpKind::AddrOf: { Status error; - lldb::ValueObjectSP value = rhs->AddressOf(error); + lldb::ValueObjectSP value = operand->AddressOf(error); if (error.Fail()) return llvm::make_error(m_expr, error.AsCString(), node->GetLocation()); return value; } + case UnaryOpKind::Minus: { + if (operand->GetCompilerType().IsReferenceType()) { + operand = operand->Dereference(error); + if (error.Fail()) + return error.ToError(); + } + llvm::Expected conv_op = + UnaryConversion(operand, node->GetOperand()->GetLocation()); + if (!conv_op) + return conv_op; + operand = *conv_op; + CompilerType 
operand_type = operand->GetCompilerType(); + if (!operand_type.IsScalarType()) { + std::string errMsg = + llvm::formatv("invalid argument type '{0}' to unary expression", + operand_type.GetTypeName()); + return llvm::make_error(m_expr, errMsg, + node->GetLocation()); + } + Scalar scalar; + bool resolved = operand->ResolveValue(scalar); + if (!resolved) + break; + + bool negated = scalar.UnaryNegate(); + if (negated) + return ValueObject::CreateValueObjectFromScalar( + m_target, scalar, operand->GetCompilerType(), "result"); + break; } - - // Unsupported/invalid operation. - return llvm::make_error( - m_expr, "invalid ast: unexpected binary operator", node->GetLocation()); + case UnaryOpKind::Plus: { + if (operand->GetCompilerType().IsReferenceType()) { + operand = operand->Dereference(error); + if (error.Fail()) + return error.ToError(); + } + llvm::Expected conv_op = + UnaryConversion(operand, node->GetOperand()->GetLocation()); + if (!conv_op) + return conv_op; + operand = *conv_op; + CompilerType operand_type = operand->GetCompilerType(); + if (!operand_type.IsScalarType() && + // Unary plus is allowed for pointers. 
+ !operand_type.IsPointerType()) { + std::string errMsg = + llvm::formatv("invalid argument type '{0}' to unary expression", + operand_type.GetTypeName()); + return llvm::make_error(m_expr, errMsg, + node->GetLocation()); + } + return operand; + } + } + return llvm::make_error(m_expr, "invalid unary operation", + node->GetLocation()); } llvm::Expected @@ -499,24 +649,6 @@ Interpreter::Visit(const BitFieldExtractionNode *node) { return child_valobj_sp; } -static llvm::Expected -GetTypeSystemFromCU(std::shared_ptr ctx) { - SymbolContext symbol_context = - ctx->GetSymbolContext(lldb::eSymbolContextCompUnit); - lldb::LanguageType language = symbol_context.comp_unit->GetLanguage(); - - symbol_context = ctx->GetSymbolContext(lldb::eSymbolContextModule); - return symbol_context.module_sp->GetTypeSystemForLanguage(language); -} - -static CompilerType GetBasicType(lldb::TypeSystemSP type_system, - lldb::BasicType basic_type) { - if (type_system) - return type_system.get()->GetBasicTypeFromAST(basic_type); - - return CompilerType(); -} - llvm::Expected Interpreter::PickIntegerType(lldb::TypeSystemSP type_system, std::shared_ptr ctx, diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp index 82b97aafe2261..072ddff1e28d2 100644 --- a/lldb/source/ValueObject/DILParser.cpp +++ b/lldb/source/ValueObject/DILParser.cpp @@ -93,9 +93,12 @@ ASTNodeUP DILParser::ParseExpression() { return ParseUnaryExpression(); } // unary_operator: // "&" // "*" +// "+" +// "-" // ASTNodeUP DILParser::ParseUnaryExpression() { - if (CurToken().IsOneOf({Token::amp, Token::star})) { + if (CurToken().IsOneOf( + {Token::amp, Token::star, Token::minus, Token::plus})) { Token token = CurToken(); uint32_t loc = token.GetLocation(); m_dil_lexer.Advance(); @@ -107,7 +110,12 @@ ASTNodeUP DILParser::ParseUnaryExpression() { case Token::amp: return std::make_unique(loc, UnaryOpKind::AddrOf, std::move(rhs)); - + case Token::minus: + return std::make_unique(loc, 
UnaryOpKind::Minus, + std::move(rhs)); + case Token::plus: + return std::make_unique(loc, UnaryOpKind::Plus, + std::move(rhs)); default: llvm_unreachable("invalid token kind"); } diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py new file mode 100644 index 0000000000000..53a85fed303f4 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py @@ -0,0 +1,46 @@ +""" +Test DIL arithmetic. +""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestFrameVarDILArithmetic(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_arithmetic(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp") + ) + + self.runCmd("settings set target.experimental.use-DIL true") + + # Check unary results and integral promotion + self.expect_var_path("+0", value="0") + self.expect_var_path("-0", value="0") + self.expect_var_path("+1", value="1") + self.expect_var_path("-1", value="-1") + self.expect_var_path("-9223372036854775808", value="9223372036854775808") + self.expect_var_path("s", value="10", type="short") + self.expect_var_path("+s", value="10", type="int") + self.expect_var_path("-s", value="-10", type="int") + self.expect_var_path("+us", value="1", type="int") + self.expect_var_path("-us", value="-1", type="int") + self.expect_var_path("+ref", value="2", type="int") + self.expect_var_path("-ref", value="-2", type="int") + 
self.expect_var_path("+0.0", value="0") + self.expect_var_path("-0.0", value="-0") + self.expect_var_path("+enum_one", value="1") + self.expect_var_path("-enum_one", value="-1") + self.expect_var_path("+wchar", value="1") + self.expect_var_path("+char16", value="2") + self.expect_var_path("+char32", value="3") + self.expect_var_path("-bitfield.a", value="-1", type="int") + self.expect_var_path("+bitfield.a", value="1", type="int") + self.expect_var_path("+bitfield.b", value="2", type="int") + self.expect_var_path("+bitfield.c", value="3", type="unsigned int") + self.expect_var_path("+bitfield.d", value="4", type="uint64_t") diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp new file mode 100644 index 0000000000000..2c70e93433f5f --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp @@ -0,0 +1,23 @@ +#include + +int main(int argc, char **argv) { + short s = 10; + unsigned short us = 1; + + int x = 2; + int &ref = x; + enum Enum { kZero, kOne } enum_one = kOne; + wchar_t wchar = 1; + char16_t char16 = 2; + char32_t char32 = 3; + + struct BitFieldStruct { + char a : 4; + int b : 32; + unsigned int c : 32; + uint64_t d : 48; + }; + BitFieldStruct bitfield = {1, 2, 3, 4}; + + return 0; // Set a breakpoint here +} diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py new file mode 100644 index 0000000000000..88429b370710e --- /dev/null +++ 
b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py @@ -0,0 +1,29 @@ +""" +Test DIL pointer arithmetic. +""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestFrameVarDILPointerArithmetic(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_pointer_arithmetic(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp") + ) + + self.runCmd("settings set target.experimental.use-DIL true") + + self.expect_var_path("+array", type="int *") + self.expect_var_path("+array_ref", type="int *") + self.expect_var_path("+p_int0", type="int *") + self.expect( + "frame var -- '-p_int0'", + error=True, + substrs=["invalid argument type 'int *' to unary expression"], + ) diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp new file mode 100644 index 0000000000000..b4e0e88b1ffc9 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp @@ -0,0 +1,11 @@ +void stop() {} + +int main(int argc, char **argv) { + int array[10]; + array[0] = 0; + int (&array_ref)[10] = array; + int *p_int0 = &array[0]; + + stop(); // Set a breakpoint here + return 0; +} From cd13d9f9e5af7dad1b389f70bb01854134cb9df5 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Mon, 24 Nov 2025 14:16:58 +0000 Subject: [PATCH 11/19] [lldb] Add test showing UnwindAssemblyInstEmulation can't handle backwards branches (#168398) If we have a conditional branch, followed by an epilogue, followed by more code, LLDB will incorrectly compute unwind information through instruction emulation. Consider this: ``` // ... 
<+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE // epilogue start <+20>: ldp x29, x30, [sp, #0x20] <+24>: add sp, sp, #0x30 <+28>: ret // epilogue end AFTER_EPILOGUE: <+32>: do something // ... <+48>: ret DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE: <+52>: stp x22, x23, [sp, #0x10] <+56>: mov x22, #0x1 <+64>: b ; <+32> AFTER_EPILOGUE ``` LLDB will think that the unwind state of +32 is the same as +28. This is false, as +32 _never_ executes after +28. The root cause of the problem is the order in which instructions are visited; they are visited in the order they appear in the text, with unwind state always being forwarded to positive branch offsets, but never to negative offsets. In the example above, `AFTER_EPILOGUE` should inherit the state of the branch in +64, but it doesn't because `AFTER_EPILOGUE` is visited right after the `ret` in +28. Fixing this should be simple: maintain a stack of instructions to visit. While the stack is not empty, take the next instruction on stack and visit it. * After visiting a non-branching instruction, push the next instruction and forward unwind state to it. * After visiting a branch with one or more known targets, push the known branch targets and forward state to them. * In all other cases (ret, or branch to register), don't push nor forward anything. Never push an instruction already on the stack. Like the algorithm today, this new algorithm also assumes that, if two instructions branch to the same target, the unwind state in both better be the same. (Note: yes, branch to register is also handled incorrectly today, and will still be incorrect). 
--- .../ARM64/TestArm64InstEmulation.cpp | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp index 033c300ad6926..e28366e9f0432 100644 --- a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp +++ b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp @@ -964,3 +964,110 @@ TEST_F(TestArm64InstEmulation, TestPrologueStartsWithStrD8) { EXPECT_TRUE(regloc.IsSame()); } } + +TEST_F(TestArm64InstEmulation, TestMidFunctionEpilogueAndBackwardsJump) { + ArchSpec arch("arm64-apple-ios15"); + std::unique_ptr engine( + static_cast( + UnwindAssemblyInstEmulation::CreateInstance(arch))); + ASSERT_NE(nullptr, engine); + + const UnwindPlan::Row *row; + AddressRange sample_range; + UnwindPlan unwind_plan(eRegisterKindLLDB); + UnwindPlan::Row::AbstractRegisterLocation regloc; + + // clang-format off + uint8_t data[] = { + 0xff, 0xc3, 0x00, 0xd1, // <+0>: sub sp, sp, #0x30 + 0xfd, 0x7b, 0x02, 0xa9, // <+4>: stp x29, x30, [sp, #0x20] + 0xfd, 0x83, 0x00, 0x91, // <+8>: add x29, sp, #0x20 + 0x1f, 0x04, 0x00, 0xf1, // <+12>: cmp x0, #0x1 + 0x21, 0x01, 0x00, 0x54, // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + 0xfd, 0x7b, 0x42, 0xa9, // <+20>: ldp x29, x30, [sp, #0x20] + 0xff, 0xc3, 0x00, 0x91, // <+24>: add sp, sp, #0x30 + 0xc0, 0x03, 0x5f, 0xd6, // <+28>: ret + // AFTER_EPILOGUE: LLDB computes the next 5 unwind states incorrectly. 
+      0x37, 0x00, 0x80, 0xd2, // <+32>: mov x23, #0x1
+      0xf6, 0x5f, 0x41, 0xa9, // <+36>: ldp x22, x23, [sp, #0x10]
+      0xfd, 0x7b, 0x42, 0xa9, // <+40>: ldp x29, x30, [sp, #0x20]
+      0xff, 0xc3, 0x00, 0x91, // <+44>: add sp, sp, #0x30
+      0xc0, 0x03, 0x5f, 0xd6, // <+48>: ret
+      // DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE
+      0xf6, 0x5f, 0x01, 0xa9, // <+52>: stp x22, x23, [sp, #0x10]
+      0x36, 0x00, 0x80, 0xd2, // <+56>: mov x22, #0x1
+      0x37, 0x00, 0x80, 0xd2, // <+60>: mov x23, #0x1
+      0xf8, 0xff, 0xff, 0x17, // <+64>: b ; <+32> AFTER_EPILOGUE
+  };
+
+  // UnwindPlan we expect:
+  // row[0]:  0: CFA=sp +0 =>
+  // row[1]:  4: CFA=sp+48 =>
+  // row[2]:  8: CFA=sp+16 => fp=[CFA-16] lr=[CFA-8]
+  // row[3]: 12: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8]
+  // row[4]: 24: CFA=sp+48 => fp=<same> lr=<same>
+  //
+  // This must come from +56
+  // row[5]: 32: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] x22=[CFA-24], x23=[CFA-32]
+  // row[6]: 40: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] x22=same, x23 = same
+  // row[7]: 44: CFA=sp+48 => fp=same lr=same x22=same, x23 = same
+  // row[8]: 48: CFA=sp+0 => fp=same lr=same x22=same, x23 = same
+  //
+  // row[x]: 52: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8]
+  // row[x]: 56: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] x22=[CFA-24], x23=[CFA-32]
+  // clang-format on
+
+  sample_range = AddressRange(0x1000, sizeof(data));
+
+  EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
+      sample_range, data, sizeof(data), unwind_plan));
+
+  // At the end of prologue (+12), CFA = fp + 16.
+  // <+0>: sub sp, sp, #0x30
+  // <+4>: stp x29, x30, [sp, #0x20]
+  // <+8>: add x29, sp, #0x20
+  row = unwind_plan.GetRowForFunctionOffset(12);
+  EXPECT_EQ(12, row->GetOffset());
+  EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset());
+  EXPECT_EQ(row->GetCFAValue().GetRegisterNumber(), gpr_fp_arm64);
+  EXPECT_EQ(row->GetCFAValue().GetOffset(), 16);
+
+  // +16 and +20 are the same as +12.
+ // <+12>: cmp x0, #0x1 + // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + EXPECT_EQ(12, unwind_plan.GetRowForFunctionOffset(16)->GetOffset()); + EXPECT_EQ(12, unwind_plan.GetRowForFunctionOffset(20)->GetOffset()); + + // After restoring $fp to caller's value, CFA = $sp + 48 + // <+20>: ldp x29, x30, [sp, #0x20] + row = unwind_plan.GetRowForFunctionOffset(24); + EXPECT_EQ(24, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 48); + + // $sp has been restored + // <+24>: add sp, sp, #0x30 + row = unwind_plan.GetRowForFunctionOffset(28); + EXPECT_EQ(28, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 0); + + // FIXME: Row for offset +32 incorrectly inherits the state of the `ret` + // instruction, but +32 _never_ executes after the `ret`. 
+ // <+28>: ret + // <+32>: mov x23, #0x1 + row = unwind_plan.GetRowForFunctionOffset(32); + // FIXME: EXPECT_NE(32, row->GetOffset()); + + // Check that the state of this branch + // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + // was forwarded to the branch target: + // <+52>: stp x22, x23, [sp, #0x10] + row = unwind_plan.GetRowForFunctionOffset(52); + EXPECT_EQ(52, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_EQ(row->GetCFAValue().GetRegisterNumber(), gpr_fp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 16); +} From 4a567e3e7c35257e47ee2fb6de61c2c4fb0d4af0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 24 Nov 2025 14:24:32 +0000 Subject: [PATCH 12/19] [llvm][utils][lit] Fix imports in ManyTests.py example (#169328) Fixes #169297 --- llvm/utils/lit/examples/many-tests/ManyTests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/utils/lit/examples/many-tests/ManyTests.py b/llvm/utils/lit/examples/many-tests/ManyTests.py index 89e818a037c39..ffdbbad5a77b1 100644 --- a/llvm/utils/lit/examples/many-tests/ManyTests.py +++ b/llvm/utils/lit/examples/many-tests/ManyTests.py @@ -1,4 +1,5 @@ -from lit import Test, TestFormat +from lit import Test +from lit.formats import TestFormat class ManyTests(TestFormat): From 24abb0603a5f491943d05ea3a2b6513238d9937e Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Mon, 24 Nov 2025 06:25:28 -0800 Subject: [PATCH 13/19] [OpenAC][CIR] func-local-declare 'copy' clause lowering (#169115) This patch implements the lowering for the 'copy' clause for a function-local declare directive. This is the first of the clauses that requires a 'cleanup' step, so it also includes some basic infrastructure for that. Fortunately there are only 8 clauses (only 6 of which require cleanup), so the if/else chain won't get too long. 
Also fortunately, we don't have to include any of the AST components, as it is possible to tell all the required details from the entry operation itself. --- clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp | 50 ++++- clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 10 +- .../test/CIR/CodeGenOpenACC/declare-copy.cpp | 199 ++++++++++++++++++ 3 files changed, 248 insertions(+), 11 deletions(-) create mode 100644 clang/test/CIR/CodeGenOpenACC/declare-copy.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp index 551027bb1c8eb..581a6ca81e2c4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp @@ -19,18 +19,52 @@ using namespace clang::CIRGen; namespace { struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { + SourceRange declareRange; mlir::acc::DeclareEnterOp enterOp; - OpenACCDeclareCleanup(mlir::acc::DeclareEnterOp enterOp) : enterOp(enterOp) {} + OpenACCDeclareCleanup(SourceRange declareRange, + mlir::acc::DeclareEnterOp enterOp) + : declareRange(declareRange), enterOp(enterOp) {} + + template + void createOutOp(CIRGenFunction &cgf, InTy inOp) { + auto outOp = + OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, inOp.getVarPtr(), + inOp.getStructured(), inOp.getImplicit(), + llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); + outOp.setDataClause(inOp.getDataClause()); + outOp.setModifiers(inOp.getModifiers()); + } void emit(CIRGenFunction &cgf) override { - mlir::acc::DeclareExitOp::create(cgf.getBuilder(), enterOp.getLoc(), - enterOp, {}); + auto exitOp = mlir::acc::DeclareExitOp::create( + cgf.getBuilder(), enterOp.getLoc(), enterOp, {}); - // TODO(OpenACC): Some clauses require that we add info about them to the - // DeclareExitOp. However, we don't have any of those implemented yet, so - // we should add infrastructure here to do that once we have one - // implemented. 
+ // Some data clauses need to be referenced in 'exit', AND need to have an + // operation after the exit. Copy these from the enter operation. + for (mlir::Value val : enterOp.getDataClauseOperands()) { + if (auto copyin = val.getDefiningOp()) { + switch (copyin.getDataClause()) { + default: + cgf.cgm.errorNYI(declareRange, + "OpenACC local declare clause copyin cleanup"); + break; + case mlir::acc::DataClause::acc_copy: + createOutOp(cgf, copyin); + break; + } + } else if (val.getDefiningOp()) { + // Link has no exit clauses, and shouldn't be copied. + continue; + } else if (val.getDefiningOp()) { + // DevicePtr has no exit clauses, and shouldn't be copied. + continue; + } else { + cgf.cgm.errorNYI(declareRange, "OpenACC local declare clause cleanup"); + continue; + } + exitOp.getDataClauseOperandsMutable().append(val); + } } }; } // namespace @@ -45,7 +79,7 @@ void CIRGenFunction::emitOpenACCDeclare(const OpenACCDeclareDecl &d) { d.clauses()); ehStack.pushCleanup(CleanupKind::NormalCleanup, - enterOp); + d.getSourceRange(), enterOp); } void CIRGenFunction::emitOpenACCRoutine(const OpenACCRoutineDecl &d) { diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index c5c6bcd0153a4..621af2344209f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -800,12 +800,16 @@ class OpenACCClauseCIREmitter final var, mlir::acc::DataClause::acc_copy, clause.getModifierList(), /*structured=*/true, /*implicit=*/false); + } else if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_copy, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. 
- return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyClause"); } } diff --git a/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp b/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp new file mode 100644 index 0000000000000..a8a9115a21b29 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. + + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare copy(always:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] 
= acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin 
varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // 
CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copy(alwaysin:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: 
%[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare copy(alwaysout:LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + 
// CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) 
-> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copy(capture:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare copy(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : 
!s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + From ceea07daa8a41562fdd884a224afbac1d7346e3e Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Mon, 24 Nov 2025 16:30:17 +0200 Subject: [PATCH 14/19] [libc++][forward_list] Applied `[[nodiscard]]` (#169019) `[[nodiscard]]` should be applied to functions where discarding the return value is most likely a correctness issue. 
- https://libcxx.llvm.org/CodingGuidelines.html#apply-nodiscard-where-relevant --- libcxx/include/forward_list | 28 ++++++++++--------- .../forward_list.nodiscard.verify.cpp | 25 +++++++++++++++-- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 272e52d68f46a..56c45d0d46575 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -732,50 +732,52 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { + return iterator(nullptr); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator 
cbegin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::__before_begin()->__next_ == nullptr; } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(__node_traits::max_size(this->__alloc_), numeric_limits::max()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { + [[__nodiscard__]] 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } diff --git a/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp index 7594a1d299a50..671c7f71ab2a2 100644 --- a/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp @@ -13,6 +13,27 @@ #include void test() { - std::forward_list forward_list; - forward_list.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::forward_list fl; + const std::forward_list cfl; + + fl.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.before_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.before_begin(); // expected-warning {{ignoring return value of function declared with 
'nodiscard' attribute}} + fl.cbefore_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cbefore_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } From 456b0512c927e37640fbdb9f6627466948f64305 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 24 Nov 2025 22:39:26 +0800 Subject: [PATCH 15/19] [VPlan] Set ZeroIsPoison=false for FirstActiveLane (#169298) When interleaving a loop with an early exit, the parts before the active lane will be all zero. Currently we emit @llvm.experimental.cttz.elts with ZeroIsPoison=true for these parts, which means that they will produce poison. We don't see any miscompiles today on AArch64 because it has the same lowering for cttz.elts regardless of ZeroIsPoison, but this may cause issues on RISC-V when interleaving. This fixes it by setting ZeroIsPoison=false. The codegen is slightly worse on RISC-V when ZeroIsPoison=false and we could potentially recover it by enabling it again when UF=1, but this is left to another PR. This is split off from #168738, where LastActiveLane can get expanded to a FirstActiveLane with an all-zeroes mask. 
--- llvm/lib/Transforms/Vectorize/VPlan.h | 2 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-- .../AArch64/simple_early_exit.ll | 14 ++-- .../AArch64/single-early-exit-interleave.ll | 8 +-- .../single-early-exit-cond-poison.ll | 6 +- .../single-early-exit-deref-assumptions.ll | 14 ++-- .../single-early-exit-interleave.ll | 64 +++++++++---------- .../LoopVectorize/single_early_exit.ll | 4 +- .../single_early_exit_live_outs.ll | 48 +++++++------- ...or-loop-backedge-elimination-early-exit.ll | 8 +-- .../PhaseOrdering/AArch64/std-find.ll | 17 ++--- 11 files changed, 98 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 405f83a6ce8e5..8a435accfedfe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1096,6 +1096,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // Calculates the first active lane index of the vector predicate operands. // It produces the lane index across all unrolled iterations. Unrolling will // add all copies of its original operand as additional operands. + // Implemented with @llvm.experimental.cttz.elts, but returns the expected + // result even with operands that are all zeroes. FirstActiveLane, // The opcodes below are used for VPInstructionWithType. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b27f2f8a3c8cb..5ea9dd349e06f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1015,7 +1015,7 @@ Value *VPInstruction::generate(VPTransformState &State) { if (getNumOperands() == 1) { Value *Mask = State.get(getOperand(0)); return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, - true, Name); + /*ZeroIsPoison=*/false, Name); } // If there are multiple operands, create a chain of selects to pick the // first operand with an active lane and add the number of lanes of the @@ -1031,9 +1031,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Builder.CreateICmpEQ(State.get(getOperand(Idx)), Builder.getFalse()), Builder.getInt64Ty()) - : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), - State.get(getOperand(Idx)), - true, Name); + : Builder.CreateCountTrailingZeroElems( + Builder.getInt64Ty(), State.get(getOperand(Idx)), + /*ZeroIsPoison=*/false, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 3b016f8d0a9ff..63348ccf94f78 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -44,7 +44,7 @@ define i64 @same_exit_block_pre_inc_use1() #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP16]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP16]], i1 false) ; CHECK-NEXT: 
[[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -125,7 +125,7 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -187,7 +187,7 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -256,7 +256,7 @@ define i64 @loop_contains_safe_div() #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[INDEX1]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -336,7 +336,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( 
; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -483,12 +483,12 @@ exit: define i64 @same_exit_block_requires_interleaving() { ; CHECK-LABEL: define i64 @same_exit_block_requires_interleaving() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [128 x %my.struct], align 8 +; CHECK-NEXT: [[P1:%.*]] = alloca [128 x [[MY_STRUCT:%.*]]], align 8 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 256) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x %my.struct], ptr [[P1]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x [[MY_STRUCT]]], ptr [[P1]], i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3 ; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[LOOP_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll index b40a184a3e425..c56f8327a48b3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll @@ -79,20 +79,20 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK: vector.early.exit: ; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 
16 -; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP59]], i1 true) +; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP59]], i1 false) ; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3 ; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]] -; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP31]], i1 true) +; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP31]], i1 false) ; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2 ; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]] ; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]] ; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP30]], i1 true) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP30]], i1 false) ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1 ; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]] ; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]] -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 false) ; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0 ; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]] ; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll index 794e274a2628c..f11f35319b8fc 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll @@ -31,9 +31,9 @@ define noundef i32 @f(i32 
noundef %g) { ; VF4IC2: [[MIDDLE_BLOCK]]: ; VF4IC2-NEXT: br label %[[RETURN:.*]] ; VF4IC2: [[VECTOR_EARLY_EXIT]]: -; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; VF4IC2-NEXT: [[TMP10:%.*]] = add i64 4, [[TMP9]] -; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; VF4IC2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] ; VF4IC2-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP11]], 4 ; VF4IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 [[TMP10]] @@ -64,7 +64,7 @@ define noundef i32 @f(i32 noundef %g) { ; VF8IC1: [[MIDDLE_BLOCK]]: ; VF8IC1-NEXT: br label %[[RETURN:.*]] ; VF8IC1: [[VECTOR_EARLY_EXIT]]: -; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true) +; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 false) ; VF8IC1-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; VF8IC1-NEXT: [[TMP7:%.*]] = add i32 0, [[TMP6]] ; VF8IC1-NEXT: br label %[[RETURN]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index 03b7ed7fe2135..0bc2748b6252d 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -28,7 +28,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[LOOP_END:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = call 
i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: br label %[[LOOP_END]] ; CHECK: [[LOOP_END]]: @@ -140,7 +140,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero(ptr n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[LOOP_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] ; CHECK-NEXT: br label %[[LOOP_END_LOOPEXIT]] ; CHECK: [[SCALAR_PH]]: @@ -336,7 +336,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] @@ -431,7 +431,7 @@ define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %fi ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add 
i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] @@ -525,7 +525,7 @@ define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] @@ -602,7 +602,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[LOOP_END:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] ; CHECK-NEXT: br label %[[LOOP_END]] ; CHECK: [[LOOP_END]]: @@ -740,7 +740,7 @@ define i64 @find_if_pointer_distance_deref_via_assumption(ptr %vec) nofree nosyn ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr 
i8, ptr [[BEGIN]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index ed5dcc78eeb78..053863117bdc8 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -124,17 +124,17 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 
[[TMP31]], i64 [[TMP29]] @@ -211,17 +211,17 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true) +; VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 false) ; VF4IC4-NEXT: [[TMP16:%.*]] = add i64 12, [[TMP15]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP28]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP28]], i1 false) ; VF4IC4-NEXT: [[TMP18:%.*]] = add i64 8, [[TMP30]] ; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 [[TMP16]] -; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP14]], i1 true) +; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP14]], i1 false) ; VF4IC4-NEXT: [[TMP22:%.*]] = add i64 4, [[TMP21]] ; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP21]], 4 ; VF4IC4-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 [[TMP20]] -; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; VF4IC4-NEXT: [[TMP26:%.*]] = add i64 0, [[TMP25]] ; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP25]], 4 ; VF4IC4-NEXT: [[TMP6:%.*]] = select i1 [[TMP27]], i64 [[TMP26]], i64 [[TMP24]] @@ -304,17 +304,17 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -401,17 +401,17 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; 
VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -503,17 +503,17 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -623,17 +623,17 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP43]], i1 true) +; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP43]], i1 false) ; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 12, [[TMP28]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 8, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] -; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 4, [[TMP34]] ; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[TMP34]], 4 ; VF4IC4-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i64 [[TMP35]], i64 [[TMP33]] -; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x 
i1> [[TMP21]], i1 true) +; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 false) ; VF4IC4-NEXT: [[TMP39:%.*]] = add i64 0, [[TMP38]] ; VF4IC4-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP38]], 4 ; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP40]], i64 [[TMP39]], i64 [[TMP37]] @@ -734,17 +734,17 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; VF4IC4-NEXT: [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]] ; VF4IC4-NEXT: [[TMP22:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE8]], 4 ; VF4IC4-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP20]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 false) ; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]] ; VF4IC4-NEXT: [[TMP25:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4 ; VF4IC4-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP23]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 0, 
[[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP26]] @@ -861,17 +861,17 @@ define i8 @same_exit_block_reverse_use_loaded_value() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP37]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP37]], i1 false) ; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 false) ; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]] ; VF4IC4-NEXT: [[TMP30:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4 ; VF4IC4-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i64 [[TMP29]], i64 [[TMP28]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]] ; VF4IC4-NEXT: [[TMP33:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4 ; VF4IC4-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i64 [[TMP32]], i64 [[TMP31]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 false) ; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4 ; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 
[[TMP36]], i64 [[TMP35]], i64 [[TMP34]] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 4fd8d17073de4..ae03f2426a800 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -424,7 +424,7 @@ define i64 @loop_guard_needed_to_prove_dereferenceable(i32 %x, i1 %cmp2) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: scalar.ph: @@ -572,7 +572,7 @@ define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr derefe ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[IV_NEXT]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 79821b8be1734..55682bc410527 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -32,7 +32,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: 
vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -96,7 +96,7 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[DOTCAST]], 2 @@ -160,7 +160,7 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = zext i64 [[FIRST_ACTIVE_LANE]] to i128 ; CHECK-NEXT: [[TMP9:%.*]] = add i128 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i128 [[TMP9]] to i32 @@ -226,7 +226,7 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x 
i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[TMP10]] to float ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float 1.000000e+00, [[DOTCAST]] @@ -294,7 +294,7 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 5 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP20]] @@ -357,7 +357,7 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -420,7 +420,7 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br 
label [[LOOP_END]] @@ -485,7 +485,7 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -549,7 +549,7 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -674,7 +674,7 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -739,7 +739,7 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> 
[[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -801,7 +801,7 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -861,7 +861,7 @@ define ptr @same_exit_block_post_inc_use1_ivptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP9]] @@ -922,7 +922,7 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1 ; 
CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] @@ -987,7 +987,7 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1122,7 +1122,7 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1189,7 +1189,7 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1258,7 +1258,7 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1 ; CHECK-NEXT: [[TMP21:%.*]] = add i64 3, [[TMP11]] @@ -1330,7 +1330,7 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) { ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[TMP0]], 1 ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 [[START]], [[TMP12]] @@ -1401,7 +1401,7 @@ define i64 @loop_contains_safe_call() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1463,7 +1463,7 @@ define i64 @loop_contains_safe_div() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 
[[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1526,7 +1526,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1594,7 +1594,7 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = sub i64 1023, [[TMP12]] ; CHECK-NEXT: br label [[LOOP_END:%.*]] @@ -1719,7 +1719,7 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll 
b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index 8da1dca52e87b..ef4d5c6d66700 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -127,7 +127,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br label %[[EXIT:.*]] ; VF8UF1: [[VECTOR_EARLY_EXIT]]: -; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true) +; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 false) ; VF8UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; VF8UF1-NEXT: br label %[[EXIT]] ; VF8UF1: [[EXIT]]: @@ -156,9 +156,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br label %[[EXIT:.*]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: -; VF8UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true) +; VF8UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 false) ; VF8UF2-NEXT: [[TMP7:%.*]] = add i64 8, [[TMP5]] -; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP1]], i1 true) +; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP1]], i1 false) ; VF8UF2-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]] ; VF8UF2-NEXT: [[TMP10:%.*]] = icmp ne i64 [[TMP8]], 8 ; VF8UF2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[TMP7]] @@ -185,7 +185,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br label %[[EXIT:.*]] ; VF16UF1: [[VECTOR_EARLY_EXIT]]: -; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true) +; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 false) ; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF16UF1-NEXT: br label %[[EXIT]] ; VF16UF1: [[EXIT]]: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index aea9a80ba6dd0..a727973b43511 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -28,7 +28,7 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK: [[MIDDLE_SPLIT]]: ; CHECK-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 false) ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[TMP7]] @@ -149,13 +149,14 @@ define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_HEADER_I_PREHEADER2:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], -8 -; CHECK: [[TMP9:%.*]] = getelementptr -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[XTRAITER]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[PROL_ITER_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; 
CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX1]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP1]], align 2 ; CHECK-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <8 x i16> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD_FR]], splat (i16 1) ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -170,10 +171,10 @@ define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[XTRAITER]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I_PREHEADER2]] ; CHECK: [[LOOP_HEADER_I_PREHEADER2]]: -; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER_I:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] From 1580f4b038c9945bf73d33b25459bece2f67ace7 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 24 Nov 2025 14:42:37 +0000 Subject: [PATCH 16/19] [AArch64] Update costs for fshl/r and add rotr/l variants. 
NFC --- llvm/test/Analysis/CostModel/AArch64/fshl.ll | 523 ++++++++++++++----- llvm/test/Analysis/CostModel/AArch64/fshr.ll | 523 ++++++++++++++----- 2 files changed, 790 insertions(+), 256 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll index 9d06b4bdec9b4..cd6068d382169 100644 --- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll @@ -5,277 +5,544 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i8 @fshl_i8_3rd_arg_const(i8 %a, i8 %b) { ; CHECK-LABEL: 'fshl_i8_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) - ret i8 %fshl + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) + ret i8 %r } define i8 @fshl_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: 'fshl_i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) - ret i8 %fshl + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) + ret i8 %r } -declare i8 @llvm.fshl.i8(i8, i8, i8) - -define i16 @fshl_i16(i16 %a, i16 %b) { -; CHECK-LABEL: 'fshl_i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i16 
@llvm.fshl.i16(i16 %a, i16 %b, i16 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %fshl +define i16 @fshl_i16_3rd_arg_const(i16 %a, i16 %b) { +; CHECK-LABEL: 'fshl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; entry: - %fshl = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) - ret i16 %fshl + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) + ret i16 %r } -declare i16 @llvm.fshl.i16(i16, i16, i16) +define i16 @fshl_i16_3rd_arg_var(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: 'fshl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %c) + ret i16 %r +} define i32 @fshl_i32_3rd_arg_const(i32 %a, i32 %b) { ; CHECK-LABEL: 'fshl_i32_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshl +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) - ret i32 %fshl + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) + ret i32 %r } define i32 @fshl_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: 'fshl_i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshl +; CHECK-NEXT: Cost 
Model: Found costs of 7 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) - ret i32 %fshl + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %r } -declare i32 @llvm.fshl.i32(i32, i32, i32) - define i64 @fshl_i64_3rd_arg_const(i64 %a, i64 %b) { ; CHECK-LABEL: 'fshl_i64_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshl +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) - ret i64 %fshl + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) + ret i64 %r } define i64 @fshl_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: 'fshl_i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %r +} + +define i128 @fshl_i128_3rd_arg_const(i128 %a, i128 %b) { +; CHECK-LABEL: 'fshl_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshl = 
tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) - ret i64 %fshl + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) + ret i128 %r } -declare i64 @llvm.fshl.i64(i64, i64, i64) +define i128 @fshl_i128_3rd_arg_var(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: 'fshl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:9 SizeLat:9 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 %c) + ret i128 %r +} define i19 @fshl_i19(i19 %a, i19 %b) { ; CHECK-LABEL: 'fshl_i19' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %r ; entry: - %fshl = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) - ret i19 %fshl + %r = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) + ret i19 %r } -declare i19 @llvm.fshl.i19(i19, i19, i19) +define i66 @fshl_i66(i66 %a, i66 %b) { +; CHECK-LABEL: 'fshl_i66' +; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %r +; +entry: + %r = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) + ret i66 %r +} define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) -; CHECK-NEXT: Cost Model: Found costs of 
RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x 
i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %r } -declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> 
@fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) + ret <8 x i16> %r } -declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) - define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x 
i32> %a, <4 x i32> %b, <4 x i32> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) + ret <4 x i32> %r } -declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) - define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> 
%b, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) + 
ret <2 x i64> %r } -declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>) - define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %r = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %r ; entry: - %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) - ret <4 x i30> %fshl + %r = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) + ret <4 x i30> %r } -declare <4 x i30> @llvm.fshl.v4i30(<4 x i30>, <4 x i30>, <4 x i30>) - define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r ; entry: - %fshl = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) - ret <2 x i66> %fshl + %r = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) + ret <2 x i66> %r } -declare <2 x i66> 
@llvm.fshl.v4i66(<2 x i66>, <2 x i66>, <2 x i66>) -define i66 @fshl_i66(i66 %a, i66 %b) { -; CHECK-LABEL: 'fshl_i66' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %fshl = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %fshl +define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) { +; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r ; entry: - %fshl = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) - ret i66 %fshl + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r } -declare i66 @llvm.fshl.i66(i66, i66, i66) - define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @fshl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { +; CHECK-LABEL: 'fshl_v2i128_3rd_arg_var' +; CHECK-NEXT: 
Cost Model: Found costs of RThru:36 CodeSize:17 Lat:21 SizeLat:21 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) + ret <2 x i128> %r +} + + +; Rotate tests + +define i8 @rotl_i8_3rd_arg_const(i8 %a) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 9) + ret i8 %r +} + +define i8 @rotl_i8_3rd_arg_var(i8 %a, i8 %c) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 %c) + ret i8 %r +} + +define i16 @rotl_i16_3rd_arg_const(i16 %a) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 9) + ret i16 %r +} + +define i16 @rotl_i16_3rd_arg_var(i16 %a, i16 %c) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %c) + ret i16 %r +} + +define i32 @rotl_i32_3rd_arg_const(i32 %a) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_const' +; CHECK-NEXT: Cost Model: 
Found costs of 1 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9) + ret i32 %r +} + +define i32 @rotl_i32_3rd_arg_var(i32 %a, i32 %c) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %c) + ret i32 %r +} + +define i64 @rotl_i64_3rd_arg_const(i64 %a) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9) + ret i64 %r +} + +define i64 @rotl_i64_3rd_arg_var(i64 %a, i64 %c) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshl = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) - ret <2 x i128> %fshl + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %c) + ret i64 %r } -declare <2 x i128> @llvm.fshl.v4i128(<2 x i128>, <2 x i128>, <2 x i128>) -define i128 @fshl_i128(i128 %a, i128 %b) { -; CHECK-LABEL: 'fshl_i128' -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshl +define i128 @rotl_i128_3rd_arg_const(i128 %a) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_const' +; 
CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) - ret i128 %fshl + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9) + ret i128 %r } -declare i128 @llvm.fshl.i128(i128, i128, i128) +define i128 @rotl_i128_3rd_arg_var(i128 %a, i128 %c) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 %c) + ret i128 %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %c) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_var' +; 
CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) + ret <16 x i8> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %c) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) + ret <8 x i16> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 
for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %c) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) + ret <4 x i32> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> 
@llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %c) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) + ret <2 x i64> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %c) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 
RThru:20 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) + ret <2 x i128> %r +} diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll index b31806b647868..795371e9f3f68 100644 --- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll @@ -5,277 +5,544 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i8 @fshr_i8_3rd_arg_const(i8 %a, i8 %b) { ; CHECK-LABEL: 'fshr_i8_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) - ret i8 %fshr + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) + ret i8 %r } define i8 @fshr_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: 'fshr_i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) - ret i8 %fshr + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) + ret i8 %r } -declare i8 @llvm.fshr.i8(i8, i8, 
i8) - -define i16 @fshr_i16(i16 %a, i16 %b) { -; CHECK-LABEL: 'fshr_i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %fshr +define i16 @fshr_i16_3rd_arg_const(i16 %a, i16 %b) { +; CHECK-LABEL: 'fshr_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; entry: - %fshr = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) - ret i16 %fshr + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) + ret i16 %r } -declare i16 @llvm.fshr.i16(i16, i16, i16) +define i16 @fshr_i16_3rd_arg_var(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: 'fshr_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %c) + ret i16 %r +} define i32 @fshr_i32_3rd_arg_const(i32 %a, i32 %b) { ; CHECK-LABEL: 'fshr_i32_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshr +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) - ret i32 %fshr + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) + ret i32 %r } define i32 @fshr_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: 'fshr_i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i32 
@llvm.fshr.i32(i32 %a, i32 %b, i32 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) - ret i32 %fshr + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %r } -declare i32 @llvm.fshr.i32(i32, i32, i32) - define i64 @fshr_i64_3rd_arg_const(i64 %a, i64 %b) { ; CHECK-LABEL: 'fshr_i64_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshr +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) - ret i64 %fshr + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) + ret i64 %r } define i64 @fshr_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: 'fshr_i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %r +} + +define i128 @fshr_i128_3rd_arg_const(i128 %a, i128 %b) { +; CHECK-LABEL: 'fshr_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 
@llvm.fshr.i128(i128 %a, i128 %b, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) - ret i64 %fshr + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) + ret i128 %r } -declare i64 @llvm.fshr.i64(i64, i64, i64) +define i128 @fshr_i128_3rd_arg_var(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: 'fshr_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:9 SizeLat:9 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 %c) + ret i128 %r +} define i19 @fshr_i19(i19 %a, i19 %b) { ; CHECK-LABEL: 'fshr_i19' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %r ; entry: - %fshr = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) - ret i19 %fshr + %r = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) + ret i19 %r } -declare i19 @llvm.fshr.i19(i19, i19, i19) +define i66 @fshr_i66(i66 %a, i66 %b) { +; CHECK-LABEL: 'fshr_i66' +; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %r +; +entry: + %r = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) + ret i66 %r +} define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found 
costs of 2 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x 
i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %r } -declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x 
i16> ) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) + ret <8 x i16> %r } -declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) - define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) { ; 
CHECK-LABEL: 'fshr_v4i32_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) + ret <4 x i32> %r } -declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) - define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 
CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> 
@llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) + ret <2 x i64> %r } -declare <2 x i64> @llvm.fshr.v4i64(<2 x i64>, <2 x i64>, <2 x i64>) - define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %r = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %r ; entry: - %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) - ret <4 x i30> %fshr + %r = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) + ret <4 x i30> %r } -declare <4 x i30> @llvm.fshr.v4i30(<4 x i30>, <4 x i30>, <4 x i30>) - define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r ; entry: - %fshr = tail call <2 x i66> @llvm.fshr.v4i66(<2 x i66> %a, <2 x 
i66> %b, <2 x i66> ) - ret <2 x i66> %fshr + %r = tail call <2 x i66> @llvm.fshr.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) + ret <2 x i66> %r } -declare <2 x i66> @llvm.fshr.v4i66(<2 x i66>, <2 x i66>, <2 x i66>) -define i66 @fshr_i66(i66 %a, i66 %b) { -; CHECK-LABEL: 'fshr_i66' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %fshr = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %fshr +define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) { +; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r ; entry: - %fshr = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) - ret i66 %fshr + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r } -declare i66 @llvm.fshr.i66(i66, i66, i66) - define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 
x i128> %r +} + +define <2 x i128> @fshr_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { +; CHECK-LABEL: 'fshr_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:17 Lat:21 SizeLat:21 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) + ret <2 x i128> %r +} + + +; Rotate tests + +define i8 @rotl_i8_3rd_arg_const(i8 %a) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 9) + ret i8 %r +} + +define i8 @rotl_i8_3rd_arg_var(i8 %a, i8 %c) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 %c) + ret i8 %r +} + +define i16 @rotl_i16_3rd_arg_const(i16 %a) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 9) + ret i16 %r +} + +define i16 @rotl_i16_3rd_arg_var(i16 %a, i16 %c) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 
@llvm.fshr.i16(i16 %a, i16 %a, i16 %c) + ret i16 %r +} + +define i32 @rotl_i32_3rd_arg_const(i32 %a) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9) + ret i32 %r +} + +define i32 @rotl_i32_3rd_arg_var(i32 %a, i32 %c) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %c) + ret i32 %r +} + +define i64 @rotl_i64_3rd_arg_const(i64 %a) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9) + ret i64 %r +} + +define i64 @rotl_i64_3rd_arg_var(i64 %a, i64 %c) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshr = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) - ret <2 x i128> %fshr + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %c) + ret i64 %r } -declare <2 x i128> @llvm.fshr.v4i128(<2 x i128>, <2 x i128>, <2 x i128>) -define i128 @fshr_i128(i128 %a, i128 %b) { -; CHECK-LABEL: 'fshr_i128' -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) -; CHECK-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshr +define i128 @rotl_i128_3rd_arg_const(i128 %a) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) - ret i128 %fshr + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9) + ret i128 %r } -declare i128 @llvm.fshr.i128(i128, i128, i128) +define i128 @rotl_i128_3rd_arg_var(i128 %a, i128 %c) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %c) + ret i128 %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 
x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %c) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) + ret <16 x i8> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %c) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) + ret <8 x i16> %r +} + +define <4 x i32> 
@rotl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %c) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) + ret <4 x i32> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> 
@rotl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %c) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) + ret <2 x i64> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, 
<2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %c) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) + ret <2 x i128> %r +} From ad0acf4af001a3781b41b572788adcd7d652d18a Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 24 Nov 2025 15:57:07 +0100 Subject: [PATCH 17/19] AMDGPU/GlobalISel: Combine S16 copy-trunc-readanylane-anyext (#168410) --- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 8 ++++++++ .../CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll | 7 ------- llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll | 6 ------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 907f8300de6d2..396d64625fb5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -173,6 +173,14 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // RALSrc = G_ANYEXT S16Src + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // Src = G_TRUNC TruncSrc + if (mi_match(Src, MRI, + m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) { + return RALSrc; + } + // TruncSrc = G_AMDGPU_READANYLANE RALSrc // AextSrc = G_TRUNC TruncSrc // Src = G_ANYEXT AextSrc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 4361e5c113708..27005e7aa175e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -1070,9 +1070,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a ; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11-True16-NEXT: ds_store_b16 v0, v1 ; GFX11-True16-NEXT: s_endpgm ; @@ -1089,10 +1086,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a ; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX12-True16-NEXT: s_wait_dscnt 0x0 -; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX12-True16-NEXT: s_wait_alu 0xf1ff -; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX12-True16-NEXT: ds_store_b16 v0, v1 ; GFX12-True16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll index bf36deac33380..9bf140cf744db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll @@ -13,9 +13,6 @@ define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, p ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_endpgm ; @@ -312,9 +309,6 @@ define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, p ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_endpgm ; From 71952df1f52c8d54ea00a9e836184ba0ece7c6c3 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 25 Nov 2025 00:20:48 +0900 Subject: [PATCH 18/19] [OpenMP][SPIRV] Disable exceptions for OpenMP SPIR-V (#169094) More missed target checks. Signed-off-by: Nick Sarnie --- clang/lib/CodeGen/CGException.cpp | 4 ++-- clang/lib/Frontend/CompilerInvocation.cpp | 3 +-- clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp | 9 +++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index f86af4581c345..e9d20672ce185 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -450,7 +450,7 @@ void CodeGenFunction::EmitCXXThrowExpr(const CXXThrowExpr *E, // Therefore, we emit a trap which will abort the program, and // prompt a warning indicating that a trap will be emitted. const llvm::Triple &T = Target.getTriple(); - if (CGM.getLangOpts().OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())) { + if (CGM.getLangOpts().OpenMPIsTargetDevice && T.isGPU()) { EmitTrapCall(llvm::Intrinsic::trap); return; } @@ -627,7 +627,7 @@ void CodeGenFunction::EmitCXXTryStmt(const CXXTryStmt &S) { // If we encounter a try statement on in an OpenMP target region offloaded to // a GPU, we treat it as a basic block. 
const bool IsTargetDevice = - (CGM.getLangOpts().OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())); + (CGM.getLangOpts().OpenMPIsTargetDevice && T.isGPU()); if (!IsTargetDevice) EnterCXXTryStmt(S); EmitStmt(S.getTryBlock()); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 44486549b946c..5f1c7afdc80a3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4293,8 +4293,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Set the flag to prevent the implementation from emitting device exception // handling code for those requiring so. - if ((Opts.OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())) || - Opts.OpenCLCPlusPlus) { + if ((Opts.OpenMPIsTargetDevice && T.isGPU()) || Opts.OpenCLCPlusPlus) { Opts.Exceptions = 0; Opts.CXXExceptions = 0; diff --git a/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp b/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp new file mode 100644 index 0000000000000..42f8f3ea70f7d --- /dev/null +++ b/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -Wno-openmp-target-exception -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -Wno-openmp-target-exception -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - | \ +// RUN: FileCheck -implicit-check-not='{{invoke|throw|cxa}}' %s +void foo() { + // CHECK: call addrspace(9) void @llvm.trap() + // CHECK-NEXT: call spir_func addrspace(9) void @__kmpc_target_deinit() + #pragma omp target + throw "bad"; +} From d542dce0e6e65d8943c31fc99391572c0287128a Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Mon, 24 Nov 2025 07:25:33 -0800 Subject: [PATCH 19/19] [OpenACC][CIR] 
copyin lowering for func-local- declare (#169336) This is exactly like the 'copy', except the exit operation is a 'delete' instead of a 'copyout'. Also, creating the 'delete' op has one less argument to it, so we have to do some special handling when creating that. --- clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp | 24 ++- clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 10 +- .../CIR/CodeGenOpenACC/declare-copyin.cpp | 199 ++++++++++++++++++ 3 files changed, 224 insertions(+), 9 deletions(-) create mode 100644 clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp index 581a6ca81e2c4..40888e7326659 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp @@ -28,12 +28,21 @@ struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { template void createOutOp(CIRGenFunction &cgf, InTy inOp) { - auto outOp = - OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, inOp.getVarPtr(), - inOp.getStructured(), inOp.getImplicit(), - llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); - outOp.setDataClause(inOp.getDataClause()); - outOp.setModifiers(inOp.getModifiers()); + if constexpr (std::is_same_v) { + auto outOp = + OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, + inOp.getStructured(), inOp.getImplicit(), + llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); + outOp.setDataClause(inOp.getDataClause()); + outOp.setModifiers(inOp.getModifiers()); + } else { + auto outOp = + OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, inOp.getVarPtr(), + inOp.getStructured(), inOp.getImplicit(), + llvm::Twine(inOp.getNameAttr()), inOp.getBounds()); + outOp.setDataClause(inOp.getDataClause()); + outOp.setModifiers(inOp.getModifiers()); + } } void emit(CIRGenFunction &cgf) override { @@ -52,6 +61,9 @@ struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { case mlir::acc::DataClause::acc_copy: createOutOp(cgf, copyin); break; + 
case mlir::acc::DataClause::acc_copyin: + createOutOp(cgf, copyin); + break; } } else if (val.getDefiningOp()) { // Link has no exit clauses, and shouldn't be copied. diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 621af2344209f..1e7a332d1dc22 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -826,12 +826,16 @@ class OpenACCClauseCIREmitter final addDataOperand( var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyInClause"); } } diff --git a/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp b/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp new file mode 100644 index 0000000000000..1ed7a7d101adb --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. 
+ + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare copyin(always:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = 
builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = 
"ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyin(alwaysin:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : 
!cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare copyin(alwaysin:LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) 
startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = 
cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyin(always:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare copyin(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr 
{name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} +