diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 86da323892f98..15edb7e77a22b 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -250,15 +250,20 @@ class SemaHLSL : public SemaBase {
                             const RecordType *RT);

   void checkSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param,
-                               const HLSLAppliedSemanticAttr *SemanticAttr);
+                               const HLSLAppliedSemanticAttr *SemanticAttr,
+                               bool IsInput);
+
   bool determineActiveSemanticOnScalar(FunctionDecl *FD,
                                        DeclaratorDecl *OutputDecl,
                                        DeclaratorDecl *D,
                                        SemanticInfo &ActiveSemantic,
-                                       llvm::StringSet<> &ActiveInputSemantics);
+                                       llvm::StringSet<> &ActiveSemantics,
+                                       bool IsInput);
+
   bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *OutputDecl,
                                DeclaratorDecl *D, SemanticInfo &ActiveSemantic,
-                               llvm::StringSet<> &ActiveInputSemantics);
+                               llvm::StringSet<> &ActiveSemantics,
+                               bool IsInput);

   void processExplicitBindingsOnDecl(VarDecl *D);
diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp
index 551027bb1c8eb..40888e7326659 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp
@@ -19,18 +19,64 @@ using namespace clang::CIRGen;
 namespace {
 struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup {
+  SourceRange declareRange;
   mlir::acc::DeclareEnterOp enterOp;

-  OpenACCDeclareCleanup(mlir::acc::DeclareEnterOp enterOp) : enterOp(enterOp) {}
+  OpenACCDeclareCleanup(SourceRange declareRange,
+                        mlir::acc::DeclareEnterOp enterOp)
+      : declareRange(declareRange), enterOp(enterOp) {}
+
+  template <typename OutTy, typename InTy>
+  void createOutOp(CIRGenFunction &cgf, InTy inOp) {
+    if constexpr (std::is_same_v<OutTy, mlir::acc::DeleteOp>) {
+      auto outOp =
+          OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp,
+                        inOp.getStructured(), inOp.getImplicit(),
+                        llvm::Twine(inOp.getNameAttr()), inOp.getBounds());
+      outOp.setDataClause(inOp.getDataClause());
+      outOp.setModifiers(inOp.getModifiers());
+    } else {
+      auto outOp =
+          OutTy::create(cgf.getBuilder(), inOp.getLoc(), inOp, inOp.getVarPtr(),
+                        inOp.getStructured(), inOp.getImplicit(),
+                        llvm::Twine(inOp.getNameAttr()), inOp.getBounds());
+      outOp.setDataClause(inOp.getDataClause());
+      outOp.setModifiers(inOp.getModifiers());
+    }
+  }

   void emit(CIRGenFunction &cgf) override {
-    mlir::acc::DeclareExitOp::create(cgf.getBuilder(), enterOp.getLoc(),
-                                     enterOp, {});
+    auto exitOp = mlir::acc::DeclareExitOp::create(
+        cgf.getBuilder(), enterOp.getLoc(), enterOp, {});

-    // TODO(OpenACC): Some clauses require that we add info about them to the
-    // DeclareExitOp. However, we don't have any of those implemented yet, so
-    // we should add infrastructure here to do that once we have one
-    // implemented.
+    // Some data clauses need to be referenced in 'exit', AND need to have an
+    // operation after the exit. Copy these from the enter operation.
+    for (mlir::Value val : enterOp.getDataClauseOperands()) {
+      if (auto copyin = val.getDefiningOp<mlir::acc::CopyinOp>()) {
+        switch (copyin.getDataClause()) {
+        default:
+          cgf.cgm.errorNYI(declareRange,
+                           "OpenACC local declare clause copyin cleanup");
+          break;
+        case mlir::acc::DataClause::acc_copy:
+          createOutOp<mlir::acc::CopyoutOp>(cgf, copyin);
+          break;
+        case mlir::acc::DataClause::acc_copyin:
+          createOutOp<mlir::acc::DeleteOp>(cgf, copyin);
+          break;
+        }
+      } else if (val.getDefiningOp<mlir::acc::DeclareLinkOp>()) {
+        // Link has no exit clauses, and shouldn't be copied.
+        continue;
+      } else if (val.getDefiningOp<mlir::acc::DevicePtrOp>()) {
+        // DevicePtr has no exit clauses, and shouldn't be copied.
+        continue;
+      } else {
+        cgf.cgm.errorNYI(declareRange, "OpenACC local declare clause cleanup");
+        continue;
+      }
+      exitOp.getDataClauseOperandsMutable().append(val);
+    }
   }
 };
 } // namespace
@@ -45,7 +91,7 @@ void CIRGenFunction::emitOpenACCDeclare(const OpenACCDeclareDecl &d) {
                                            d.clauses());

   ehStack.pushCleanup<OpenACCDeclareCleanup>(CleanupKind::NormalCleanup,
-                                             enterOp);
+                                             d.getSourceRange(), enterOp);
 }

 void CIRGenFunction::emitOpenACCRoutine(const OpenACCRoutineDecl &d) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
index c5c6bcd0153a4..1e7a332d1dc22 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp
@@ -800,12 +800,16 @@ class OpenACCClauseCIREmitter final
             var, mlir::acc::DataClause::acc_copy, clause.getModifierList(),
             /*structured=*/true,
             /*implicit=*/false);
+    } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) {
+      for (const Expr *var : clause.getVarList())
+        addDataOperand<mlir::acc::CopyinOp, mlir::acc::CopyoutOp>(
+            var, mlir::acc::DataClause::acc_copy, clause.getModifierList(),
+            /*structured=*/true,
+            /*implicit=*/false);
     } else if constexpr (isCombinedType<OpTy>) {
       applyToComputeOp(clause);
     } else {
-      // TODO: When we've implemented this for everything, switch this to an
-      // unreachable. declare construct remains.
-      return clauseNotImplemented(clause);
+      llvm_unreachable("Unknown construct kind in VisitCopyClause");
     }
   }

@@ -822,12 +826,16 @@ class OpenACCClauseCIREmitter final
         addDataOperand<mlir::acc::CopyinOp, mlir::acc::DeleteOp>(
             var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(),
             /*structured=*/false, /*implicit=*/false);
+    } else if constexpr (isOneOfTypes<OpTy, mlir::acc::DeclareEnterOp>) {
+      for (const Expr *var : clause.getVarList())
+        addDataOperand<mlir::acc::CopyinOp, mlir::acc::DeleteOp>(
+            var, mlir::acc::DataClause::acc_copyin, clause.getModifierList(),
+            /*structured=*/true,
+            /*implicit=*/false);
     } else if constexpr (isCombinedType<OpTy>) {
       applyToComputeOp(clause);
     } else {
-      // TODO: When we've implemented this for everything, switch this to an
-      // unreachable. declare construct remains.
- return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyInClause"); } } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index f86af4581c345..e9d20672ce185 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -450,7 +450,7 @@ void CodeGenFunction::EmitCXXThrowExpr(const CXXThrowExpr *E, // Therefore, we emit a trap which will abort the program, and // prompt a warning indicating that a trap will be emitted. const llvm::Triple &T = Target.getTriple(); - if (CGM.getLangOpts().OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())) { + if (CGM.getLangOpts().OpenMPIsTargetDevice && T.isGPU()) { EmitTrapCall(llvm::Intrinsic::trap); return; } @@ -627,7 +627,7 @@ void CodeGenFunction::EmitCXXTryStmt(const CXXTryStmt &S) { // If we encounter a try statement on in an OpenMP target region offloaded to // a GPU, we treat it as a basic block. const bool IsTargetDevice = - (CGM.getLangOpts().OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())); + (CGM.getLangOpts().OpenMPIsTargetDevice && T.isGPU()); if (!IsTargetDevice) EnterCXXTryStmt(S); EmitStmt(S.getTryBlock()); diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index d281c4c20616a..bca7c30557f03 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -320,7 +320,7 @@ class ComplexExprEmitter QualType getPromotionType(FPOptionsOverride Features, QualType Ty, bool IsComplexDivisor) { if (auto *CT = Ty->getAs()) { - QualType ElementType = CT->getElementType(); + QualType ElementType = CT->getElementType().getCanonicalType(); bool IsFloatingType = ElementType->isFloatingType(); bool IsComplexRangePromoted = CGF.getLangOpts().getComplexRange() == LangOptions::ComplexRangeKind::CX_Promoted; diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 2a5f3f6895609..f5c07fe2e33ff 100644 --- 
a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -731,13 +731,22 @@ llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad( } if (SemanticName == "SV_POSITION") { - if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) - return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, - Semantic->getAttrName()->getName(), - /* BuiltIn::FragCoord */ 15); + if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) { + if (CGM.getTarget().getTriple().isSPIRV()) + return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, + Semantic->getAttrName()->getName(), + /* BuiltIn::FragCoord */ 15); + if (CGM.getTarget().getTriple().isDXIL()) + return emitDXILUserSemanticLoad(B, Type, Semantic, Index); + } + + if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Vertex) { + return emitUserSemanticLoad(B, Type, Decl, Semantic, Index); + } } - llvm_unreachable("non-handled system semantic. FIXME."); + llvm_unreachable( + "Load hasn't been implemented yet for this system semantic. FIXME"); } static void createSPIRVBuiltinStore(IRBuilder<> &B, llvm::Module &M, @@ -760,12 +769,22 @@ void CGHLSLRuntime::emitSystemSemanticStore(IRBuilder<> &B, llvm::Value *Source, std::optional Index) { std::string SemanticName = Semantic->getAttrName()->getName().upper(); - if (SemanticName == "SV_POSITION") - createSPIRVBuiltinStore(B, CGM.getModule(), Source, - Semantic->getAttrName()->getName(), - /* BuiltIn::Position */ 0); - else - llvm_unreachable("non-handled system semantic. FIXME."); + if (SemanticName == "SV_POSITION") { + if (CGM.getTarget().getTriple().isDXIL()) { + emitDXILUserSemanticStore(B, Source, Semantic, Index); + return; + } + + if (CGM.getTarget().getTriple().isSPIRV()) { + createSPIRVBuiltinStore(B, CGM.getModule(), Source, + Semantic->getAttrName()->getName(), + /* BuiltIn::Position */ 0); + return; + } + } + + llvm_unreachable( + "Store hasn't been implemented yet for this system semantic. 
FIXME"); } llvm::Value *CGHLSLRuntime::handleScalarSemanticLoad( diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index cbc5931390376..9d7c851bead3e 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4360,8 +4360,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Set the flag to prevent the implementation from emitting device exception // handling code for those requiring so. - if ((Opts.OpenMPIsTargetDevice && (T.isNVPTX() || T.isAMDGCN())) || - Opts.OpenCLCPlusPlus) { + if ((Opts.OpenMPIsTargetDevice && T.isGPU()) || Opts.OpenCLCPlusPlus) { Opts.Exceptions = 0; Opts.CXXExceptions = 0; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 1a2fe0b3c17a7..cd14000c6d3df 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -10743,7 +10743,7 @@ static void DetectPrecisionLossInComplexDivision(Sema &S, QualType DivisorTy, if (!CT) return; - QualType ElementType = CT->getElementType(); + QualType ElementType = CT->getElementType().getCanonicalType(); bool IsComplexRangePromoted = S.getLangOpts().getComplexRange() == LangOptions::ComplexRangeKind::CX_Promoted; if (!ElementType->isFloatingType() || !IsComplexRangePromoted) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0a164a7b5bbbd..ecab3946b58c7 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -771,9 +771,12 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) { } } -bool SemaHLSL::determineActiveSemanticOnScalar( - FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, - SemanticInfo &ActiveSemantic, llvm::StringSet<> &UsedSemantics) { +bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, + DeclaratorDecl *OutputDecl, + DeclaratorDecl *D, + SemanticInfo &ActiveSemantic, + llvm::StringSet<> &UsedSemantics, + bool IsInput) { if (ActiveSemantic.Semantic == 
nullptr) { ActiveSemantic.Semantic = D->getAttr(); if (ActiveSemantic.Semantic) @@ -792,7 +795,7 @@ bool SemaHLSL::determineActiveSemanticOnScalar( if (!A) return false; - checkSemanticAnnotation(FD, D, A); + checkSemanticAnnotation(FD, D, A, IsInput); OutputDecl->addAttr(A); unsigned Location = ActiveSemantic.Index.value_or(0); @@ -820,7 +823,8 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *OutputDecl, DeclaratorDecl *D, SemanticInfo &ActiveSemantic, - llvm::StringSet<> &UsedSemantics) { + llvm::StringSet<> &UsedSemantics, + bool IsInput) { if (ActiveSemantic.Semantic == nullptr) { ActiveSemantic.Semantic = D->getAttr(); if (ActiveSemantic.Semantic) @@ -833,12 +837,13 @@ bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, const RecordType *RT = dyn_cast(T); if (!RT) return determineActiveSemanticOnScalar(FD, OutputDecl, D, ActiveSemantic, - UsedSemantics); + UsedSemantics, IsInput); const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { SemanticInfo Info = ActiveSemantic; - if (!determineActiveSemantic(FD, OutputDecl, Field, Info, UsedSemantics)) { + if (!determineActiveSemantic(FD, OutputDecl, Field, Info, UsedSemantics, + IsInput)) { Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field; return false; } @@ -920,7 +925,7 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { // FIXME: Verify output semantics in parameters. 
if (!determineActiveSemantic(FD, Param, Param, ActiveSemantic, - ActiveInputSemantics)) { + ActiveInputSemantics, /* IsInput= */ true)) { Diag(Param->getLocation(), diag::note_previous_decl) << Param; FD->setInvalidDecl(); } @@ -932,12 +937,13 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { if (ActiveSemantic.Semantic) ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); if (!FD->getReturnType()->isVoidType()) - determineActiveSemantic(FD, FD, FD, ActiveSemantic, ActiveOutputSemantics); + determineActiveSemantic(FD, FD, FD, ActiveSemantic, ActiveOutputSemantics, + /* IsInput= */ false); } void SemaHLSL::checkSemanticAnnotation( FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAppliedSemanticAttr *SemanticAttr) { + const HLSLAppliedSemanticAttr *SemanticAttr, bool IsInput) { auto *ShaderAttr = EntryPoint->getAttr(); assert(ShaderAttr && "Entry point has no shader attribute"); llvm::Triple::EnvironmentType ST = ShaderAttr->getType(); @@ -961,11 +967,12 @@ void SemaHLSL::checkSemanticAnnotation( } if (SemanticName == "SV_POSITION") { - // TODO(#143523): allow use on other shader types & output once the overall - // semantic logic is implemented. - if (ST == llvm::Triple::Pixel) + // SV_Position can be an input or output in vertex shaders, + // but only an input in pixel shaders. 
+ if (ST == llvm::Triple::Vertex || (ST == llvm::Triple::Pixel && IsInput)) return; - DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, + {llvm::Triple::Pixel, llvm::Triple::Vertex}); return; } diff --git a/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl b/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl new file mode 100644 index 0000000000000..d4d89bd5d26ba --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input-struct-shadow.hlsl @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'void (S)' +// CHECK-NEXT: ParmVarDecl {{.*}} s 'S' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "C" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "C" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "C" 1 +void main(S s : C) {} diff --git a/clang/test/AST/HLSL/semantic-input-struct.hlsl b/clang/test/AST/HLSL/semantic-input-struct.hlsl new file mode 100644 index 0000000000000..d71fdcff631f4 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input-struct.hlsl @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: 
FieldDecl {{.*}} field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'void (S)' +// CHECK-NEXT: ParmVarDecl {{.*}} s 'S' +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "A" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "B" 4 +void main(S s) {} diff --git a/clang/test/AST/HLSL/semantic-input.hlsl b/clang/test/AST/HLSL/semantic-input.hlsl new file mode 100644 index 0000000000000..4dc3ab9db7392 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-input.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + +// CHECK: ParmVarDecl {{.*}} a 'float4':'vector' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "ABC" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "ABC" 0 + +void main(float4 a : ABC) { +} diff --git a/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl b/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl new file mode 100644 index 0000000000000..e83901bb17943 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output-struct-shadow.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} referenced field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} referenced field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'S ()' +// CHECK: HLSLParsedSemanticAttr {{.*}} "DEF" 0 +// CHECK: HLSLAppliedSemanticAttr {{.*}} "DEF" 0 +// 
CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "DEF" 1 +S main() : DEF { + S tmp; + return tmp; +} diff --git a/clang/test/AST/HLSL/semantic-output-struct.hlsl b/clang/test/AST/HLSL/semantic-output-struct.hlsl new file mode 100644 index 0000000000000..727c0f3040641 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output-struct.hlsl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + + +// CHECK: CXXRecordDecl {{.*}} referenced struct S definition +// CHECK: FieldDecl {{.*}} referenced field1 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "A" 0 +// CHECK: FieldDecl {{.*}} referenced field2 'int' +// CHECK-NEXT: HLSLParsedSemanticAttr {{.*}} "B" 4 + +struct S { + int field1 : A; + int field2 : B4; +}; + +// CHECK: FunctionDecl {{.*}} main 'S ()' +// CHECK: HLSLAppliedSemanticAttr {{.*}} "A" 0 +// CHECK-NEXT: HLSLAppliedSemanticAttr {{.*}} "B" 4 +S main() { + S tmp; + return tmp; +} diff --git a/clang/test/AST/HLSL/semantic-output.hlsl b/clang/test/AST/HLSL/semantic-output.hlsl new file mode 100644 index 0000000000000..63429387f8d66 --- /dev/null +++ b/clang/test/AST/HLSL/semantic-output.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.8-vertex -finclude-default-header -ast-dump -o - %s | FileCheck %s + +// CHECK: FunctionDecl {{.*}} main 'uint ()' +// CHECK: HLSLParsedSemanticAttr {{.*}} "ABC" 0 +// CHECK: HLSLAppliedSemanticAttr {{.*}} "ABC" 0 +uint main() : ABC { + return 0; +} diff --git a/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp b/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp new file mode 100644 index 0000000000000..a8a9115a21b29 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-copy.cpp @@ -0,0 +1,199 @@ +// RUN: 
%clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. + + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare copy(always:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : 
!cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, 
!cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // 
CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copy(alwaysin:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare copy(alwaysout:LocalHSE, 
LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: 
acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copy(capture:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin 
varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare copy(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: 
%[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp b/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp new file mode 100644 index 0000000000000..1ed7a7d101adb --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-copyin.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. 
+ + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare copyin(always:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = 
builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = 
"ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyin(alwaysin:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : 
!cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare copyin(alwaysin:LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) 
startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = 
cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyin(always:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_COPYIN:.*]] = acc.copyin varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare copyin(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr 
{name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_COPYIN:.*]] = acc.copyin varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_COPYIN]], %[[LOC_INT_COPYIN]], %[[LOC_HSE_ARR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_COPYIN]], %[[ARG_INT_COPYIN]], %[[ARG_HSE_PTR_COPYIN]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_COPYIN]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_COPYIN]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 13ad0545ab53f..6a884e98e9f3b 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -321,10 +321,10 @@ __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) { // CHECK: icmp eq <32 x i8> return _mm256_cmpeq_epi8(a, b); } -TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8( - (__m128i)(__v16qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16}, - (__m128i)(__v16qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}), - 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1)); +TEST_CONSTEXPR(match_v32qi(_mm256_cmpeq_epi8( + (__m256i)(__v32qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-16,15,-14,13,-12,11,-10,9,-8,7,-6,5,4,-3,2,-1}, + (__m256i)(__v32qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16,10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}), + 0, -1, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_cmpeq_epi16 diff --git a/clang/test/CodeGen/promoted-complex-div.c b/clang/test/CodeGen/promoted-complex-div.c index 7ed7b07db83ae..006b5e334e6ea 100644 --- a/clang/test/CodeGen/promoted-complex-div.c +++ b/clang/test/CodeGen/promoted-complex-div.c @@ -81,3 +81,55 @@ _Complex double divf(_Complex double 
a, _Complex double b) { return a / b; // nopromotion-warning{{excess precision is requested but the target does not support excess precision which may result in observable differences in complex division behavior}} } + +// This test ensures that Clang does not crash when complex element types +// require desugaring under -complex-range=promoted. Previously, a sugared +// typedef element type (e.g., 'typedef double a') caused a crash during +// complex range evaluation in both Sema and CodeGen. +typedef double a; +_Complex double *b; +// CHECK-LABEL: define dso_local void @DivideByComplexZero +void DivideByComplexZero() { + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fadd x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fmul x86_fp80 + // CHECK: fsub x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fptrunc x86_fp80 + // CHECK: fptrunc x86_fp80 + + // NOX87: call double @llvm.fabs.f64(double {{.*}}) + // NOX87-NEXT: call double @llvm.fabs.f64(double {{.*}} + // NOX87-NEXT: fcmp ugt double {{.*}}, {{.*}} + // NOX87-NEXT: br i1 {{.*}}, label + // NOX87: abs_rhsr_greater_or_equal_abs_rhsi: + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label {{.*}} + // NOX87: abs_rhsr_less_than_abs_rhsi: + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label {{.*}} + // NOX87: complex_div: + // NOX87-NEXT: phi double + // NOX87-NEXT: phi double + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 0 + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 1 + // NOX87-NEXT: store double + // NOX87-NEXT: store 
double + + *b /= 1.0iF * (a)0; +} diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl index be30e79438831..b7d2283ea7766 100644 --- a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl +++ b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl @@ -1,11 +1,21 @@ -// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple spirv-pc-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-DXIL -// CHECK: @SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +// CHECK-SPIRV: @SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_0:]] // CHECK: define void @main() {{.*}} { float4 main(float4 p : SV_Position) : A { - // CHECK: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 - // CHECK: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) - // CHECK: store <4 x float> %[[#R]], ptr addrspace(8) @A0, align 16 + // CHECK-SPIRV: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 + // CHECK-SPIRV: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) + // CHECK-SPIRV: store <4 x float> %[[#R]], ptr addrspace(8) @A0, align 16 + + // CHECK-DXIL: %SV_Position0 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison) + // CHECK-DXIL: %[[#TMP:]] = call <4 x float> @_Z4mainDv4_f(<4 x float> %SV_Position0) + // CHECK-DXIL: call void @llvm.dx.store.output.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison, <4 x float> %[[#TMP]]) 
return p; } + +// CHECK-SPIRV-DAG: ![[#MD_0]] = !{![[#MD_1:]]} +// CHECK-SPIRV-DAG: ![[#MD_1]] = !{i32 11, i32 15} +// | `-> BuiltIn Position +// `-> SPIR-V decoration 'FragCoord' diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl new file mode 100644 index 0000000000000..0156c0bb816c1 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/SV_Position.vs.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -triple dxil-unknown-shadermodel6.8-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck --check-prefix=CHECK-DXIL %s +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-vertex -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck --check-prefix=CHECK-SPIRV %s + +// CHECK-SPIRV: @SV_Position0 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations ![[#MD_0:]] +// CHECK-SPIRV: @SV_Position = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations ![[#MD_2:]] + +// CHECK: define void @main() {{.*}} { +float4 main(float4 p : SV_Position) : SV_Position { + // CHECK-SPIRV: %[[#P:]] = load <4 x float>, ptr addrspace(7) @SV_Position0, align 16 + // CHECK-SPIRV: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) + // CHECK-SPIRV: store <4 x float> %[[#R]], ptr addrspace(8) @SV_Position, align 16 + + // CHECK-DXIL: %SV_Position0 = call <4 x float> @llvm.dx.load.input.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison) + // CHECK-DXIL: %[[#TMP:]] = call <4 x float> @_Z4mainDv4_f(<4 x float> %SV_Position0) + // CHECK-DXIL: call void @llvm.dx.store.output.v4f32(i32 4, i32 0, i32 0, i8 0, i32 poison, <4 x float> %[[#TMP]]) + return p; +} + +// CHECK-SPIRV-DAG: ![[#MD_0]] = !{![[#MD_1:]]} +// CHECK-SPIRV-DAG: ![[#MD_2]] = !{![[#MD_3:]]} +// CHECK-SPIRV-DAG: ![[#MD_1]] = !{i32 30, i32 0} +// | `-> Location 0 +// `-> SPIR-V decoration 'Location' +// 
CHECK-SPIRV-DAG: ![[#MD_3]] = !{i32 11, i32 0} +// | `-> BuiltIn Position +// `-> SPIR-V decoration 'BuiltIn' diff --git a/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp b/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp new file mode 100644 index 0000000000000..42f8f3ea70f7d --- /dev/null +++ b/clang/test/OpenMP/spirv_target_codegen_noexceptions.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -Wno-openmp-target-exception -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -Wno-openmp-target-exception -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - | \ +// RUN: FileCheck -implicit-check-not='{{invoke|throw|cxa}}' %s +void foo() { + // CHECK: call addrspace(9) void @llvm.trap() + // CHECK-NEXT: call spir_func addrspace(9) void @__kmpc_target_deinit() + #pragma omp target + throw "bad"; +} diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl index 2d02384821d90..47d07887911d6 100644 --- a/clang/test/SemaHLSL/Semantics/position.ps.hlsl +++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl @@ -1,13 +1,7 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -finclude-default-header -x hlsl -verify -o - %s +// RUN: %clang_cc1 -triple spirv-pc-vulkan1.3-pixel -finclude-default-header -x hlsl -verify -o - %s -// FIXME(Keenuts): change output semantic to something valid for pixels shaders -float4 main(float4 a : SV_Position2) : A { -// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 used a 'float4':'vector' -// CHECK-NEXT: HLSLParsedSemanticAttr 
0x{{[0-9a-f]+}} "SV_Position" 2 -// CHECK-NEXT: HLSLAppliedSemanticAttr 0x{{[0-9a-f]+}} "SV_Position" 2 - -// CHECK: HLSLParsedSemanticAttr 0x{{[0-9a-f]+}} "A" 0 -// CHECK: HLSLAppliedSemanticAttr 0x{{[0-9a-f]+}} "A" 0 +float4 main(float4 a : A) : SV_Position { +// expected-error@-1 {{attribute 'SV_Position' is unsupported in 'pixel' shaders, requires one of the following: pixel, vertex}} return a; } diff --git a/clang/test/SemaHLSL/Semantics/position.vs.hlsl b/clang/test/SemaHLSL/Semantics/position.vs.hlsl deleted file mode 100644 index 9d0ff285ce055..0000000000000 --- a/clang/test/SemaHLSL/Semantics/position.vs.hlsl +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-vertex -x hlsl -finclude-default-header -o - %s -verify - -// expected-error@+1 {{attribute 'SV_Position' is unsupported in 'vertex' shaders, requires pixel}} -float4 main(float4 a : SV_Position) : A { - return a; -} diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 272e52d68f46a..56c45d0d46575 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -732,50 +732,52 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return 
const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { + return iterator(nullptr); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::__before_begin()->__next_ == nullptr; } - 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(__node_traits::max_size(this->__alloc_), numeric_limits::max()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } diff --git a/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp index 7594a1d299a50..671c7f71ab2a2 100644 --- a/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/forward_list.nodiscard.verify.cpp @@ -13,6 +13,27 @@ #include void test() { - std::forward_list forward_list; - forward_list.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::forward_list fl; + const std::forward_list cfl; + + fl.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.end(); // expected-warning {{ignoring return 
value of function declared with 'nodiscard' attribute}} + fl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.before_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.before_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.cbefore_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.cbefore_begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + fl.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + fl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cfl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } diff --git a/lldb/docs/dil-expr-lang.ebnf b/lldb/docs/dil-expr-lang.ebnf index 70eda3bf40650..ccd2b00223910 100644 --- a/lldb/docs/dil-expr-lang.ebnf +++ b/lldb/docs/dil-expr-lang.ebnf @@ -8,7 +8,7 @@ expression = unary_expression ; unary_expression = postfix_expression | unary_operator expression ; -unary_operator = "*" | "&" ; +unary_operator = "*" | "&" | "+" | "-"; postfix_expression = primary_expression | postfix_expression "[" integer_literal "]" diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h index 25b208a65349b..99ea0585e5370 100644 --- 
a/lldb/include/lldb/Symbol/TypeSystem.h +++ b/lldb/include/lldb/Symbol/TypeSystem.h @@ -411,6 +411,18 @@ class TypeSystem : public PluginInterface, GetIntegralTemplateArgument(lldb::opaque_compiler_type_t type, size_t idx, bool expand_pack); + // DIL + + /// Checks if the type is eligible for integral promotion. + virtual bool IsPromotableIntegerType(lldb::opaque_compiler_type_t type); + + /// Perform integral promotion on a given type. + /// This promotes eligible types (boolean, integers, unscoped enumerations) + /// to a larger integer type according to type system rules. + /// \returns Promoted type. + virtual llvm::Expected + DoIntegralPromotion(CompilerType from, ExecutionContextScope *exe_scope); + // Dumping types #ifndef NDEBUG diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h index 0f05d753f1b56..91f8d93c09622 100644 --- a/lldb/include/lldb/ValueObject/DILAST.h +++ b/lldb/include/lldb/ValueObject/DILAST.h @@ -33,6 +33,8 @@ enum class NodeKind { enum class UnaryOpKind { AddrOf, // "&" Deref, // "*" + Minus, // "-" + Plus, // "+" }; /// Forward declaration, for use in DIL AST nodes. Definition is at the very diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h index eab3218ff828f..a65edc58cc4e7 100644 --- a/lldb/include/lldb/ValueObject/DILEval.h +++ b/lldb/include/lldb/ValueObject/DILEval.h @@ -61,6 +61,10 @@ class Interpreter : Visitor { llvm::Expected Visit(const BooleanLiteralNode *node) override; + /// Perform usual unary conversions on a value. At the moment this + /// includes array-to-pointer and integral promotion for eligible types. 
+ llvm::Expected + UnaryConversion(lldb::ValueObjectSP valobj, uint32_t location); llvm::Expected PickIntegerType(lldb::TypeSystemSP type_system, std::shared_ptr ctx, diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 51cb883748514..aa8d309fbc730 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -7346,6 +7346,102 @@ CompilerType TypeSystemClang::GetTypeForFormatters(void *type) { return CompilerType(); } +bool TypeSystemClang::IsPromotableIntegerType( + lldb::opaque_compiler_type_t type) { + // Unscoped enums are always considered as promotable, even if their + // underlying type does not need to be promoted (e.g. "int"). + bool is_signed = false; + bool isUnscopedEnumerationType = + IsEnumerationType(type, is_signed) && !IsScopedEnumerationType(type); + if (isUnscopedEnumerationType) + return true; + + switch (GetBasicTypeEnumeration(type)) { + case lldb::eBasicTypeBool: + case lldb::eBasicTypeChar: + case lldb::eBasicTypeSignedChar: + case lldb::eBasicTypeUnsignedChar: + case lldb::eBasicTypeShort: + case lldb::eBasicTypeUnsignedShort: + case lldb::eBasicTypeWChar: + case lldb::eBasicTypeSignedWChar: + case lldb::eBasicTypeUnsignedWChar: + case lldb::eBasicTypeChar16: + case lldb::eBasicTypeChar32: + return true; + + default: + return false; + } + + llvm_unreachable("All cases handled above."); +} + +llvm::Expected +TypeSystemClang::DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) { + if (!from.IsInteger() && !from.IsUnscopedEnumerationType()) + return from; + + if (!from.IsPromotableIntegerType()) + return from; + + if (from.IsUnscopedEnumerationType()) { + EnumDecl *enum_decl = GetAsEnumDecl(from); + CompilerType promotion_type = GetType(enum_decl->getPromotionType()); + return DoIntegralPromotion(promotion_type, exe_scope); + } + + lldb::BasicType builtin_type = + 
from.GetCanonicalType().GetBasicTypeEnumeration(); + uint64_t from_size = 0; + if (builtin_type == lldb::eBasicTypeWChar || + builtin_type == lldb::eBasicTypeSignedWChar || + builtin_type == lldb::eBasicTypeUnsignedWChar || + builtin_type == lldb::eBasicTypeChar16 || + builtin_type == lldb::eBasicTypeChar32) { + // Find the type that can hold the entire range of values for our type. + bool is_signed = from.IsSigned(); + llvm::Expected from_size = from.GetByteSize(exe_scope); + if (!from_size) + return from_size.takeError(); + CompilerType promote_types[] = { + GetBasicTypeFromAST(lldb::eBasicTypeInt), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedInt), + GetBasicTypeFromAST(lldb::eBasicTypeLong), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedLong), + GetBasicTypeFromAST(lldb::eBasicTypeLongLong), + GetBasicTypeFromAST(lldb::eBasicTypeUnsignedLongLong), + }; + for (CompilerType &type : promote_types) { + llvm::Expected byte_size = type.GetByteSize(exe_scope); + if (!byte_size) + return byte_size.takeError(); + if (*from_size < *byte_size || + (*from_size == *byte_size && is_signed == type.IsSigned())) { + return type; + } + } + llvm_unreachable("char type should fit into long long"); + } + + // Here we can promote only to "int" or "unsigned int". + CompilerType int_type = GetBasicTypeFromAST(lldb::eBasicTypeInt); + llvm::Expected int_byte_size = int_type.GetByteSize(exe_scope); + if (!int_byte_size) + return int_byte_size.takeError(); + + // Signed integer types can be safely promoted to "int". + if (from.IsSigned()) { + return int_type; + } + // Unsigned integer types are promoted to "unsigned int" if "int" cannot hold + // their entire value range. + return (from_size == *int_byte_size) + ? 
GetBasicTypeFromAST(lldb::eBasicTypeUnsignedInt) + : int_type; +} + clang::EnumDecl *TypeSystemClang::GetAsEnumDecl(const CompilerType &type) { const clang::EnumType *enutype = llvm::dyn_cast(ClangUtil::GetCanonicalQualType(type)); diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 375891b3cfd2f..67d206e4d2df2 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -938,6 +938,14 @@ class TypeSystemClang : public TypeSystem { CompilerType GetTypeForFormatters(void *type) override; + // DIL + + bool IsPromotableIntegerType(lldb::opaque_compiler_type_t type) override; + + llvm::Expected + DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) override; + #define LLDB_INVALID_DECL_LEVEL UINT32_MAX // LLDB_INVALID_DECL_LEVEL is returned by CountDeclLevels if child_decl_ctx // could not be found in decl_ctx. diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index c999ab256fc98..1a39ea9476390 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -370,30 +370,10 @@ bool CompilerType::IsScalarOrUnscopedEnumerationType() const { } bool CompilerType::IsPromotableIntegerType() const { - // Unscoped enums are always considered as promotable, even if their - // underlying type does not need to be promoted (e.g. "int"). 
- if (IsUnscopedEnumerationType()) - return true; - - switch (GetBasicTypeEnumeration()) { - case lldb::eBasicTypeBool: - case lldb::eBasicTypeChar: - case lldb::eBasicTypeSignedChar: - case lldb::eBasicTypeUnsignedChar: - case lldb::eBasicTypeShort: - case lldb::eBasicTypeUnsignedShort: - case lldb::eBasicTypeWChar: - case lldb::eBasicTypeSignedWChar: - case lldb::eBasicTypeUnsignedWChar: - case lldb::eBasicTypeChar16: - case lldb::eBasicTypeChar32: - return true; - - default: - return false; - } - - llvm_unreachable("All cases handled above."); + if (IsValid()) + if (auto type_system_sp = GetTypeSystem()) + return type_system_sp->IsPromotableIntegerType(m_type); + return false; } bool CompilerType::IsPointerToVoid() const { diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index f7d634ffa2dec..8712142893835 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -123,6 +123,17 @@ CompilerType TypeSystem::GetTypeForFormatters(void *type) { return CompilerType(weak_from_this(), type); } +bool TypeSystem::IsPromotableIntegerType(lldb::opaque_compiler_type_t type) { + return false; +} + +llvm::Expected +TypeSystem::DoIntegralPromotion(CompilerType from, + ExecutionContextScope *exe_scope) { + return llvm::createStringError( + "Integral promotion is not implemented for this TypeSystem"); +} + bool TypeSystem::IsTemplateType(lldb::opaque_compiler_type_t type) { return false; } diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp index a9dbfad298d05..40a05a467f883 100644 --- a/lldb/source/ValueObject/DILEval.cpp +++ b/lldb/source/ValueObject/DILEval.cpp @@ -21,6 +21,101 @@ namespace lldb_private::dil { +static llvm::Expected +GetTypeSystemFromCU(std::shared_ptr ctx) { + auto stack_frame = ctx->CalculateStackFrame(); + if (!stack_frame) + return llvm::createStringError("no stack frame in this context"); + SymbolContext symbol_context = + 
stack_frame->GetSymbolContext(lldb::eSymbolContextCompUnit); + lldb::LanguageType language = symbol_context.comp_unit->GetLanguage(); + + symbol_context = stack_frame->GetSymbolContext(lldb::eSymbolContextModule); + return symbol_context.module_sp->GetTypeSystemForLanguage(language); +} + +static CompilerType GetBasicType(lldb::TypeSystemSP type_system, + lldb::BasicType basic_type) { + if (type_system) + return type_system.get()->GetBasicTypeFromAST(basic_type); + + return CompilerType(); +} + +static lldb::ValueObjectSP +ArrayToPointerConversion(ValueObject &valobj, ExecutionContextScope &ctx) { + uint64_t addr = valobj.GetLoadAddress(); + ExecutionContext exe_ctx; + ctx.CalculateExecutionContext(exe_ctx); + return ValueObject::CreateValueObjectFromAddress( + "result", addr, exe_ctx, + valobj.GetCompilerType().GetArrayElementType(&ctx).GetPointerType(), + /* do_deref */ false); +} + +llvm::Expected +Interpreter::UnaryConversion(lldb::ValueObjectSP valobj, uint32_t location) { + if (!valobj) + return llvm::make_error(m_expr, "invalid value object", + location); + llvm::Expected type_system = + GetTypeSystemFromCU(m_exe_ctx_scope); + if (!type_system) + return type_system.takeError(); + + CompilerType in_type = valobj->GetCompilerType(); + if (valobj->IsBitfield()) { + // Promote bitfields. If `int` can represent the bitfield value, it is + // converted to `int`. Otherwise, if `unsigned int` can represent it, it + // is converted to `unsigned int`. Otherwise, it is treated as its + // underlying type. + uint32_t bitfield_size = valobj->GetBitfieldBitSize(); + // Some bitfields have undefined size (e.g. result of ternary operation). + // The AST's `bitfield_size` of those is 0, and no promotion takes place. 
+ if (bitfield_size > 0 && in_type.IsInteger()) { + CompilerType int_type = GetBasicType(*type_system, lldb::eBasicTypeInt); + CompilerType uint_type = + GetBasicType(*type_system, lldb::eBasicTypeUnsignedInt); + llvm::Expected int_bit_size = + int_type.GetBitSize(m_exe_ctx_scope.get()); + if (!int_bit_size) + return int_bit_size.takeError(); + llvm::Expected uint_bit_size = + uint_type.GetBitSize(m_exe_ctx_scope.get()); + if (!uint_bit_size) + return int_bit_size.takeError(); + if (bitfield_size < *int_bit_size || + (in_type.IsSigned() && bitfield_size == *int_bit_size)) + return valobj->CastToBasicType(int_type); + if (bitfield_size <= *uint_bit_size) + return valobj->CastToBasicType(uint_type); + // Re-create as a const value with the same underlying type + Scalar scalar; + bool resolved = valobj->ResolveValue(scalar); + if (!resolved) + return llvm::createStringError("invalid scalar value"); + return ValueObject::CreateValueObjectFromScalar(m_target, scalar, in_type, + "result"); + } + } + + if (in_type.IsArrayType()) + valobj = ArrayToPointerConversion(*valobj, *m_exe_ctx_scope); + + if (valobj->GetCompilerType().IsInteger() || + valobj->GetCompilerType().IsUnscopedEnumerationType()) { + llvm::Expected promoted_type = + type_system.get()->DoIntegralPromotion(valobj->GetCompilerType(), + m_exe_ctx_scope.get()); + if (!promoted_type) + return promoted_type.takeError(); + if (!promoted_type->CompareTypes(valobj->GetCompilerType())) + return valobj->CastToBasicType(*promoted_type); + } + + return valobj; +} + static lldb::VariableSP DILFindVariable(ConstString name, VariableList &variable_list) { lldb::VariableSP exact_match; @@ -147,6 +242,10 @@ Interpreter::Interpreter(lldb::TargetSP target, llvm::StringRef expr, llvm::Expected Interpreter::Evaluate(const ASTNode *node) { // Evaluate an AST. auto value_or_error = node->Accept(this); + // Convert SP with a nullptr to an error. 
+ if (value_or_error && !*value_or_error) + return llvm::make_error(m_expr, "invalid value object", + node->GetLocation()); // Return the computed value-or-error. The caller is responsible for // checking if an error occured during the evaluation. return value_or_error; @@ -175,21 +274,21 @@ Interpreter::Visit(const IdentifierNode *node) { llvm::Expected Interpreter::Visit(const UnaryOpNode *node) { Status error; - auto rhs_or_err = Evaluate(node->GetOperand()); - if (!rhs_or_err) - return rhs_or_err; + auto op_or_err = Evaluate(node->GetOperand()); + if (!op_or_err) + return op_or_err; - lldb::ValueObjectSP rhs = *rhs_or_err; + lldb::ValueObjectSP operand = *op_or_err; switch (node->GetKind()) { case UnaryOpKind::Deref: { - lldb::ValueObjectSP dynamic_rhs = rhs->GetDynamicValue(m_use_dynamic); - if (dynamic_rhs) - rhs = dynamic_rhs; + lldb::ValueObjectSP dynamic_op = operand->GetDynamicValue(m_use_dynamic); + if (dynamic_op) + operand = dynamic_op; - lldb::ValueObjectSP child_sp = rhs->Dereference(error); + lldb::ValueObjectSP child_sp = operand->Dereference(error); if (!child_sp && m_use_synthetic) { - if (lldb::ValueObjectSP synth_obj_sp = rhs->GetSyntheticValue()) { + if (lldb::ValueObjectSP synth_obj_sp = operand->GetSyntheticValue()) { error.Clear(); child_sp = synth_obj_sp->Dereference(error); } @@ -202,18 +301,69 @@ Interpreter::Visit(const UnaryOpNode *node) { } case UnaryOpKind::AddrOf: { Status error; - lldb::ValueObjectSP value = rhs->AddressOf(error); + lldb::ValueObjectSP value = operand->AddressOf(error); if (error.Fail()) return llvm::make_error(m_expr, error.AsCString(), node->GetLocation()); return value; } + case UnaryOpKind::Minus: { + if (operand->GetCompilerType().IsReferenceType()) { + operand = operand->Dereference(error); + if (error.Fail()) + return error.ToError(); + } + llvm::Expected conv_op = + UnaryConversion(operand, node->GetOperand()->GetLocation()); + if (!conv_op) + return conv_op; + operand = *conv_op; + CompilerType 
operand_type = operand->GetCompilerType(); + if (!operand_type.IsScalarType()) { + std::string errMsg = + llvm::formatv("invalid argument type '{0}' to unary expression", + operand_type.GetTypeName()); + return llvm::make_error(m_expr, errMsg, + node->GetLocation()); + } + Scalar scalar; + bool resolved = operand->ResolveValue(scalar); + if (!resolved) + break; + + bool negated = scalar.UnaryNegate(); + if (negated) + return ValueObject::CreateValueObjectFromScalar( + m_target, scalar, operand->GetCompilerType(), "result"); + break; } - - // Unsupported/invalid operation. - return llvm::make_error( - m_expr, "invalid ast: unexpected binary operator", node->GetLocation()); + case UnaryOpKind::Plus: { + if (operand->GetCompilerType().IsReferenceType()) { + operand = operand->Dereference(error); + if (error.Fail()) + return error.ToError(); + } + llvm::Expected conv_op = + UnaryConversion(operand, node->GetOperand()->GetLocation()); + if (!conv_op) + return conv_op; + operand = *conv_op; + CompilerType operand_type = operand->GetCompilerType(); + if (!operand_type.IsScalarType() && + // Unary plus is allowed for pointers. 
+ !operand_type.IsPointerType()) { + std::string errMsg = + llvm::formatv("invalid argument type '{0}' to unary expression", + operand_type.GetTypeName()); + return llvm::make_error(m_expr, errMsg, + node->GetLocation()); + } + return operand; + } + } + return llvm::make_error(m_expr, "invalid unary operation", + node->GetLocation()); } llvm::Expected @@ -499,24 +649,6 @@ Interpreter::Visit(const BitFieldExtractionNode *node) { return child_valobj_sp; } -static llvm::Expected -GetTypeSystemFromCU(std::shared_ptr ctx) { - SymbolContext symbol_context = - ctx->GetSymbolContext(lldb::eSymbolContextCompUnit); - lldb::LanguageType language = symbol_context.comp_unit->GetLanguage(); - - symbol_context = ctx->GetSymbolContext(lldb::eSymbolContextModule); - return symbol_context.module_sp->GetTypeSystemForLanguage(language); -} - -static CompilerType GetBasicType(lldb::TypeSystemSP type_system, - lldb::BasicType basic_type) { - if (type_system) - return type_system.get()->GetBasicTypeFromAST(basic_type); - - return CompilerType(); -} - llvm::Expected Interpreter::PickIntegerType(lldb::TypeSystemSP type_system, std::shared_ptr ctx, diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp index 82b97aafe2261..072ddff1e28d2 100644 --- a/lldb/source/ValueObject/DILParser.cpp +++ b/lldb/source/ValueObject/DILParser.cpp @@ -93,9 +93,12 @@ ASTNodeUP DILParser::ParseExpression() { return ParseUnaryExpression(); } // unary_operator: // "&" // "*" +// "+" +// "-" // ASTNodeUP DILParser::ParseUnaryExpression() { - if (CurToken().IsOneOf({Token::amp, Token::star})) { + if (CurToken().IsOneOf( + {Token::amp, Token::star, Token::minus, Token::plus})) { Token token = CurToken(); uint32_t loc = token.GetLocation(); m_dil_lexer.Advance(); @@ -107,7 +110,12 @@ ASTNodeUP DILParser::ParseUnaryExpression() { case Token::amp: return std::make_unique(loc, UnaryOpKind::AddrOf, std::move(rhs)); - + case Token::minus: + return std::make_unique(loc, 
UnaryOpKind::Minus, + std::move(rhs)); + case Token::plus: + return std::make_unique(loc, UnaryOpKind::Plus, + std::move(rhs)); default: llvm_unreachable("invalid token kind"); } diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py new file mode 100644 index 0000000000000..53a85fed303f4 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py @@ -0,0 +1,46 @@ +""" +Test DIL arithmetic. +""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestFrameVarDILArithmetic(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_arithmetic(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp") + ) + + self.runCmd("settings set target.experimental.use-DIL true") + + # Check unary results and integral promotion + self.expect_var_path("+0", value="0") + self.expect_var_path("-0", value="0") + self.expect_var_path("+1", value="1") + self.expect_var_path("-1", value="-1") + self.expect_var_path("-9223372036854775808", value="9223372036854775808") + self.expect_var_path("s", value="10", type="short") + self.expect_var_path("+s", value="10", type="int") + self.expect_var_path("-s", value="-10", type="int") + self.expect_var_path("+us", value="1", type="int") + self.expect_var_path("-us", value="-1", type="int") + self.expect_var_path("+ref", value="2", type="int") + self.expect_var_path("-ref", value="-2", type="int") + 
self.expect_var_path("+0.0", value="0") + self.expect_var_path("-0.0", value="-0") + self.expect_var_path("+enum_one", value="1") + self.expect_var_path("-enum_one", value="-1") + self.expect_var_path("+wchar", value="1") + self.expect_var_path("+char16", value="2") + self.expect_var_path("+char32", value="3") + self.expect_var_path("-bitfield.a", value="-1", type="int") + self.expect_var_path("+bitfield.a", value="1", type="int") + self.expect_var_path("+bitfield.b", value="2", type="int") + self.expect_var_path("+bitfield.c", value="3", type="unsigned int") + self.expect_var_path("+bitfield.d", value="4", type="uint64_t") diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp new file mode 100644 index 0000000000000..2c70e93433f5f --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/main.cpp @@ -0,0 +1,23 @@ +#include + +int main(int argc, char **argv) { + short s = 10; + unsigned short us = 1; + + int x = 2; + int &ref = x; + enum Enum { kZero, kOne } enum_one = kOne; + wchar_t wchar = 1; + char16_t char16 = 2; + char32_t char32 = 3; + + struct BitFieldStruct { + char a : 4; + int b : 32; + unsigned int c : 32; + uint64_t d : 48; + }; + BitFieldStruct bitfield = {1, 2, 3, 4}; + + return 0; // Set a breakpoint here +} diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py new file mode 100644 index 0000000000000..88429b370710e --- /dev/null +++ 
b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILPointerArithmetic.py @@ -0,0 +1,29 @@ +""" +Test DIL pointer arithmetic. +""" + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestFrameVarDILPointerArithmetic(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_pointer_arithmetic(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp") + ) + + self.runCmd("settings set target.experimental.use-DIL true") + + self.expect_var_path("+array", type="int *") + self.expect_var_path("+array_ref", type="int *") + self.expect_var_path("+p_int0", type="int *") + self.expect( + "frame var -- '-p_int0'", + error=True, + substrs=["invalid argument type 'int *' to unary expression"], + ) diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp new file mode 100644 index 0000000000000..b4e0e88b1ffc9 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp @@ -0,0 +1,11 @@ +void stop() {} + +int main(int argc, char **argv) { + int array[10]; + array[0] = 0; + int (&array_ref)[10] = array; + int *p_int0 = &array[0]; + + stop(); // Set a breakpoint here + return 0; +} diff --git a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp index 033c300ad6926..e28366e9f0432 100644 --- a/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp +++ b/lldb/unittests/UnwindAssembly/ARM64/TestArm64InstEmulation.cpp @@ -964,3 +964,110 @@ TEST_F(TestArm64InstEmulation, TestPrologueStartsWithStrD8) { EXPECT_TRUE(regloc.IsSame()); } } + +TEST_F(TestArm64InstEmulation, TestMidFunctionEpilogueAndBackwardsJump) { + ArchSpec arch("arm64-apple-ios15"); + std::unique_ptr engine( + static_cast( + 
UnwindAssemblyInstEmulation::CreateInstance(arch))); + ASSERT_NE(nullptr, engine); + + const UnwindPlan::Row *row; + AddressRange sample_range; + UnwindPlan unwind_plan(eRegisterKindLLDB); + UnwindPlan::Row::AbstractRegisterLocation regloc; + + // clang-format off + uint8_t data[] = { + 0xff, 0xc3, 0x00, 0xd1, // <+0>: sub sp, sp, #0x30 + 0xfd, 0x7b, 0x02, 0xa9, // <+4>: stp x29, x30, [sp, #0x20] + 0xfd, 0x83, 0x00, 0x91, // <+8>: add x29, sp, #0x20 + 0x1f, 0x04, 0x00, 0xf1, // <+12>: cmp x0, #0x1 + 0x21, 0x01, 0x00, 0x54, // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + 0xfd, 0x7b, 0x42, 0xa9, // <+20>: ldp x29, x30, [sp, #0x20] + 0xff, 0xc3, 0x00, 0x91, // <+24>: add sp, sp, #0x30 + 0xc0, 0x03, 0x5f, 0xd6, // <+28>: ret + // AFTER_EPILOGUE: LLDB computes the next 5 unwind states incorrectly. + 0x37, 0x00, 0x80, 0xd2, // <+32>: mov x23, #0x1 + 0xf6, 0x5f, 0x41, 0xa9, // <+36>: ldp x22, x23, [sp, #0x10] + 0xfd, 0x7b, 0x42, 0xa9, // <+40>: ldp x29, x30, [sp, #0x20] + 0xff, 0xc3, 0x00, 0x91, // <+44>: add sp, sp, #0x30 + 0xc0, 0x03, 0x5f, 0xd6, // <+48>: ret + // DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + 0xf6, 0x5f, 0x01, 0xa9, // <+52>: stp x22, x23, [sp, #0x10] + 0x36, 0x00, 0x80, 0xd2, // <+56>: mov x22, #0x1 + 0x37, 0x00, 0x80, 0xd2, // <+60>: mov x23, #0x1 + 0xf8, 0xff, 0xff, 0x17, // <+64>: b ; <+32> AFTER_EPILOGUE + }; + + // UnwindPlan we expect: + // row[0]: 0: CFA=sp +0 => + // row[1]: 4: CFA=sp+48 => + // row[2]: 8: CFA=sp+16 => fp=[CFA-16] lr=[CFA-8] + // row[3]: 12: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] + // row[4]: 24: CFA=sp+48 => fp= lr= + // + // This must come from +56 + // row[5]: 32: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] x22=[CFA-24], x23=[CFA-32] + // row[6]: 40: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] x22=same, x23 = same + // row[6]: 44: CFA=sp+48 => fp=same lr=same x22=same, x23 = same + // row[6]: 48: CFA=sp0 => fp=same lr=same x22=same, x23 = same + // + // row[x]: 52: CFA=fp+16 => fp=[CFA-16] lr=[CFA-8] + // row[x]: 56: CFA=fp+16 => 
fp=[CFA-16] lr=[CFA-8] x22=[CFA-24], x23=[CFA-32] + // clang-format on + + sample_range = AddressRange(0x1000, sizeof(data)); + + EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly( + sample_range, data, sizeof(data), unwind_plan)); + + // At the end of prologue (+12), CFA = fp + 16. + // <+0>: sub sp, sp, #0x30 + // <+4>: stp x29, x30, [sp, #0x20] + // <+8>: add x29, sp, #0x20 + row = unwind_plan.GetRowForFunctionOffset(12); + EXPECT_EQ(12, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_EQ(row->GetCFAValue().GetRegisterNumber(), gpr_fp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 16); + + // +16 and +20 are the same as +12. + // <+12>: cmp x0, #0x1 + // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + EXPECT_EQ(12, unwind_plan.GetRowForFunctionOffset(16)->GetOffset()); + EXPECT_EQ(12, unwind_plan.GetRowForFunctionOffset(20)->GetOffset()); + + // After restoring $fp to caller's value, CFA = $sp + 48 + // <+20>: ldp x29, x30, [sp, #0x20] + row = unwind_plan.GetRowForFunctionOffset(24); + EXPECT_EQ(24, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 48); + + // $sp has been restored + // <+24>: add sp, sp, #0x30 + row = unwind_plan.GetRowForFunctionOffset(28); + EXPECT_EQ(28, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 0); + + // FIXME: Row for offset +32 incorrectly inherits the state of the `ret` + // instruction, but +32 _never_ executes after the `ret`. 
+ // <+28>: ret + // <+32>: mov x23, #0x1 + row = unwind_plan.GetRowForFunctionOffset(32); + // FIXME: EXPECT_NE(32, row->GetOffset()); + + // Check that the state of this branch + // <+16>: b.ne ; <+52> DO_SOMETHING_AND_GOTO_AFTER_EPILOGUE + // was forwarded to the branch target: + // <+52>: stp x22, x23, [sp, #0x10] + row = unwind_plan.GetRowForFunctionOffset(52); + EXPECT_EQ(52, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset()); + EXPECT_EQ(row->GetCFAValue().GetRegisterNumber(), gpr_fp_arm64); + EXPECT_EQ(row->GetCFAValue().GetOffset(), 16); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 907f8300de6d2..396d64625fb5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -173,6 +173,14 @@ Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // RALSrc = G_ANYEXT S16Src + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // Src = G_TRUNC TruncSrc + if (mi_match(Src, MRI, + m_GTrunc(m_GAMDGPUReadAnyLane(m_GAnyExt(m_Reg(RALSrc)))))) { + return RALSrc; + } + // TruncSrc = G_AMDGPU_READANYLANE RALSrc // AextSrc = G_TRUNC TruncSrc // Src = G_ANYEXT AextSrc diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 70564973816b1..e8fda829e2394 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -300,6 +300,12 @@ def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllZerosV))]>; def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; +let AddedComplexity = 1, Predicates = [HasVLX] in { + def AVX512_128_SETALLONES : I<0, Pseudo, (outs VR128X:$dst), (ins), + "", [(set VR128X:$dst, (v4i32 immAllOnesV))]>; + def AVX512_256_SETALLONES : I<0, 
Pseudo, (outs VR256X:$dst), (ins), + "", [(set VR256X:$dst, (v8i32 immAllOnesV))]>; +} } let Predicates = [HasAVX512] in { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index cb0208a4a5f32..b988ae0aca912 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -778,6 +778,8 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SH: @@ -6246,9 +6248,31 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: { Register Reg = MIB.getReg(0); - MIB->setDesc(get(X86::VPTERNLOGDZrri)); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::AVX512_128_SETALLONES: { + if (X86::VR128RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr)); + + Opc = X86::VPTERNLOGDZ128rri; + break; + } + case X86::AVX512_256_SETALLONES: { + if (X86::VR256RegClass.contains(Reg)) + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + + Opc = X86::VPTERNLOGDZ256rri; + break; + } + case X86::AVX512_512_SETALLONES: + Opc = X86::VPTERNLOGDZrri; + break; + } + MIB->setDesc(get(Opc)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. 
MIB.addReg(Reg, RegState::Undef) @@ -8190,6 +8214,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: + case X86::AVX512_256_SETALLONES: Alignment = Align(32); break; case X86::V_SET0: @@ -8197,6 +8222,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: + case X86::AVX512_128_SETALLONES: Alignment = Align(16); break; case X86::MMX_SET0: @@ -8255,6 +8281,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::AVX512_128_SET0: case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: + case X86::AVX512_128_SETALLONES: + case X86::AVX512_256_SETALLONES: case X86::AVX512_512_SETALLONES: case X86::FsFLD0SH: case X86::AVX512_FsFLD0SH: @@ -8315,6 +8343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( break; case X86::AVX1_SETALLONES: case X86::AVX2_SETALLONES: + case X86::AVX512_256_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::AVX512_256_SET0: @@ -8328,6 +8357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( 2); break; case X86::V_SETALLONES: + case X86::AVX512_128_SETALLONES: IsAllOnes = true; [[fallthrough]]; case X86::V_SET0: diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 8883a527c2226..4ac6cb247bd13 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -665,7 +665,10 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, return; SmallVector BrInsts; + SmallPtrSet Visited; auto Pred = [&](const Instruction *I) { + if (!Visited.insert(I).second) + return false; if (const BranchInst *Br = dyn_cast(I)) if (Br->isConditional()) BrInsts.push_back(Br); @@ -684,28 +687,10 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... 
/\ ChildS_{m, n_m} // // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m - // - // FIXME: Currently, recursive branches are not handled. For example, we - // can't deduce that ptr must be dereferenced in below function. - // - // void f(int a, int c, int *ptr) { - // if(a) - // if (b) { - // *ptr = 0; - // } else { - // *ptr = 1; - // } - // else { - // if (b) { - // *ptr = 0; - // } else { - // *ptr = 1; - // } - // } - // } Explorer->checkForAllContext(&CtxI, Pred); - for (const BranchInst *Br : BrInsts) { + while (!BrInsts.empty()) { + const BranchInst *Br = BrInsts.pop_back_val(); StateType ParentState; // The known state of the parent state is a conjunction of children's @@ -714,15 +699,18 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S, for (const BasicBlock *BB : Br->successors()) { StateType ChildState; - size_t BeforeSize = Uses.size(); - followUsesInContext(AA, A, *Explorer, &BB->front(), Uses, ChildState); + const Instruction *I = &BB->front(); + followUsesInContext(AA, A, *Explorer, I, Uses, ChildState); // Erase uses which only appear in the child. for (auto It = Uses.begin() + BeforeSize; It != Uses.end();) It = Uses.erase(It); ParentState &= ChildState; + + // Check for recursive conditional branches. + Explorer->checkForAllContext(I, Pred); } // Use only known state. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 405f83a6ce8e5..8a435accfedfe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1096,6 +1096,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // Calculates the first active lane index of the vector predicate operands. // It produces the lane index across all unrolled iterations. Unrolling will // add all copies of its original operand as additional operands. + // Implemented with @llvm.experimental.cttz.elts, but returns the expected + // result even with operands that are all zeroes. 
FirstActiveLane, // The opcodes below are used for VPInstructionWithType. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b27f2f8a3c8cb..5ea9dd349e06f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1015,7 +1015,7 @@ Value *VPInstruction::generate(VPTransformState &State) { if (getNumOperands() == 1) { Value *Mask = State.get(getOperand(0)); return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask, - true, Name); + /*ZeroIsPoison=*/false, Name); } // If there are multiple operands, create a chain of selects to pick the // first operand with an active lane and add the number of lanes of the @@ -1031,9 +1031,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Builder.CreateICmpEQ(State.get(getOperand(Idx)), Builder.getFalse()), Builder.getInt64Ty()) - : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), - State.get(getOperand(Idx)), - true, Name); + : Builder.CreateCountTrailingZeroElems( + Builder.getInt64Ty(), State.get(getOperand(Idx)), + /*ZeroIsPoison=*/false, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll index 9d06b4bdec9b4..cd6068d382169 100644 --- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll @@ -5,277 +5,544 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i8 @fshl_i8_3rd_arg_const(i8 %a, i8 %b) { ; CHECK-LABEL: 'fshl_i8_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 
9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) - ret i8 %fshl + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 9) + ret i8 %r } define i8 @fshl_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: 'fshl_i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) - ret i8 %fshl + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c) + ret i8 %r } -declare i8 @llvm.fshl.i8(i8, i8, i8) - -define i16 @fshl_i16(i16 %a, i16 %b) { -; CHECK-LABEL: 'fshl_i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %fshl +define i16 @fshl_i16_3rd_arg_const(i16 %a, i16 %b) { +; CHECK-LABEL: 'fshl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; entry: - %fshl = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) - ret i16 %fshl + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 9) + ret i16 %r } -declare i16 @llvm.fshl.i16(i16, i16, i16) +define i16 @fshl_i16_3rd_arg_var(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: 'fshl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: 
+ %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %c) + ret i16 %r +} define i32 @fshl_i32_3rd_arg_const(i32 %a, i32 %b) { ; CHECK-LABEL: 'fshl_i32_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshl +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) - ret i32 %fshl + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 9) + ret i32 %r } define i32 @fshl_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: 'fshl_i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) - ret i32 %fshl + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %r } -declare i32 @llvm.fshl.i32(i32, i32, i32) - define i64 @fshl_i64_3rd_arg_const(i64 %a, i64 %b) { ; CHECK-LABEL: 'fshl_i64_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshl +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshl = tail call i64 @llvm.fshl.i64(i64 %a, 
i64 %b, i64 9) - ret i64 %fshl + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 9) + ret i64 %r } define i64 @fshl_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: 'fshl_i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %r +} + +define i128 @fshl_i128_3rd_arg_const(i128 %a, i128 %b) { +; CHECK-LABEL: 'fshl_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) - ret i64 %fshl + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) + ret i128 %r } -declare i64 @llvm.fshl.i64(i64, i64, i64) +define i128 @fshl_i128_3rd_arg_var(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: 'fshl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:9 SizeLat:9 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 %c) + ret i128 %r +} define i19 @fshl_i19(i19 %a, i19 %b) { ; CHECK-LABEL: 'fshl_i19' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 
for: %r = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %r ; entry: - %fshl = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) - ret i19 %fshl + %r = tail call i19 @llvm.fshl.i19(i19 %a, i19 %b, i19 9) + ret i19 %r } -declare i19 @llvm.fshl.i19(i19, i19, i19) +define i66 @fshl_i66(i66 %a, i66 %b) { +; CHECK-LABEL: 'fshl_i66' +; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %r +; +entry: + %r = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) + ret i66 %r +} define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: 
Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %fshl + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %r } -declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x 
i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) - ret <8 x i16> %fshl + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) + ret <8 x i16> %r } -declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) - define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> 
%a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> 
%fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) - ret <4 x i32> %fshl + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) + ret <4 x i32> %r } -declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) - define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 
SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshl +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshl = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) - ret <2 x i64> %fshl + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) + ret <2 x i64> %r } -declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>) - define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %r = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %r ; entry: - %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) - ret <4 x i30> %fshl + %r = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x 
i30> %b, <4 x i30> %c) + ret <4 x i30> %r } -declare <4 x i30> @llvm.fshl.v4i30(<4 x i30>, <4 x i30>, <4 x i30>) - define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r ; entry: - %fshl = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) - ret <2 x i66> %fshl + %r = tail call <2 x i66> @llvm.fshl.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) + ret <2 x i66> %r } -declare <2 x i66> @llvm.fshl.v4i66(<2 x i66>, <2 x i66>, <2 x i66>) -define i66 @fshl_i66(i66 %a, i66 %b) { -; CHECK-LABEL: 'fshl_i66' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %fshl = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %fshl +define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) { +; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r ; entry: - %fshl = tail call i66 @llvm.fshl.i66(i66 %a, i66 %b, i66 9) - ret i66 %fshl + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + 
ret <2 x i128> %r } -declare i66 @llvm.fshl.i66(i66, i66, i66) - define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshl +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @fshl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { +; CHECK-LABEL: 'fshl_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:17 Lat:21 SizeLat:21 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) + ret <2 x i128> %r +} + + +; Rotate tests + +define i8 @rotl_i8_3rd_arg_const(i8 %a) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 9) + ret i8 %r +} + +define i8 @rotl_i8_3rd_arg_var(i8 %a, i8 %c) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 
%c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshl.i8(i8 %a, i8 %a, i8 %c) + ret i8 %r +} + +define i16 @rotl_i16_3rd_arg_const(i16 %a) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 9) + ret i16 %r +} + +define i16 @rotl_i16_3rd_arg_var(i16 %a, i16 %c) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %c) + ret i16 %r +} + +define i32 @rotl_i32_3rd_arg_const(i32 %a) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9) + ret i32 %r +} + +define i32 @rotl_i32_3rd_arg_var(i32 %a, i32 %c) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %c) + ret i32 %r +} + +define i64 @rotl_i64_3rd_arg_const(i64 %a) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail 
call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9) + ret i64 %r +} + +define i64 @rotl_i64_3rd_arg_var(i64 %a, i64 %c) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshl = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) - ret <2 x i128> %fshl + %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %c) + ret i64 %r } -declare <2 x i128> @llvm.fshl.v4i128(<2 x i128>, <2 x i128>, <2 x i128>) -define i128 @fshl_i128(i128 %a, i128 %b) { -; CHECK-LABEL: 'fshl_i128' -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshl +define i128 @rotl_i128_3rd_arg_const(i128 %a) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshl = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9) - ret i128 %fshl + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9) + ret i128 %r } -declare i128 @llvm.fshl.i128(i128, i128, i128) +define i128 @rotl_i128_3rd_arg_var(i128 %a, i128 %c) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 %c) + ret i128 %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> 
%a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %c) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) + ret <16 x i8> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) { +; CHECK-LABEL: 
'rotl_v8i16_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %c) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) + ret <8 x i16> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %c) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: 
Found costs of 5 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) + ret <4 x i32> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %c) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshl.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) + ret <2 x i64> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 
SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %c) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshl.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) + ret <2 x i128> %r +} diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll index b31806b647868..795371e9f3f68 100644 --- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll +++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll @@ -5,277 +5,544 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i8 @fshr_i8_3rd_arg_const(i8 %a, i8 %b) { ; CHECK-LABEL: 'fshr_i8_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 
SizeLat:1 for: ret i8 %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) - ret i8 %fshr + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 9) + ret i8 %r } define i8 @fshr_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: 'fshr_i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r ; entry: - %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) - ret i8 %fshr + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c) + ret i8 %r } -declare i8 @llvm.fshr.i8(i8, i8, i8) - -define i16 @fshr_i16(i16 %a, i16 %b) { -; CHECK-LABEL: 'fshr_i16' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %fshr +define i16 @fshr_i16_3rd_arg_const(i16 %a, i16 %b) { +; CHECK-LABEL: 'fshr_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r ; entry: - %fshr = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) - ret i16 %fshr + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 9) + ret i16 %r } -declare i16 @llvm.fshr.i16(i16, i16, i16) +define i16 @fshr_i16_3rd_arg_var(i16 %a, i16 %b, i16 %c) { +; CHECK-LABEL: 'fshr_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i16 
@llvm.fshr.i16(i16 %a, i16 %b, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %c) + ret i16 %r +} define i32 @fshr_i32_3rd_arg_const(i32 %a, i32 %b) { ; CHECK-LABEL: 'fshr_i32_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshr +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) - ret i32 %fshr + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 9) + ret i32 %r } define i32 @fshr_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: 'fshr_i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r ; entry: - %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) - ret i32 %fshr + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %r } -declare i32 @llvm.fshr.i32(i32, i32, i32) - define i64 @fshr_i64_3rd_arg_const(i64 %a, i64 %b) { ; CHECK-LABEL: 'fshr_i64_3rd_arg_const' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshr +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) +; 
CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) - ret i64 %fshr + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 9) + ret i64 %r } define i64 @fshr_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: 'fshr_i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %r +} + +define i128 @fshr_i128_3rd_arg_const(i128 %a, i128 %b) { +; CHECK-LABEL: 'fshr_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) - ret i64 %fshr + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) + ret i128 %r } -declare i64 @llvm.fshr.i64(i64, i64, i64) +define i128 @fshr_i128_3rd_arg_var(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: 'fshr_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:9 Lat:9 SizeLat:9 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 %c) + ret i128 %r +} define i19 @fshr_i19(i19 %a, i19 %b) { ; CHECK-LABEL: 'fshr_i19' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, 
i19 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i19 %r ; entry: - %fshr = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) - ret i19 %fshr + %r = tail call i19 @llvm.fshr.i19(i19 %a, i19 %b, i19 9) + ret i19 %r } -declare i19 @llvm.fshr.i19(i19, i19, i19) +define i66 @fshr_i66(i66 %a, i66 %b) { +; CHECK-LABEL: 'fshr_i66' +; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %r +; +entry: + %r = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) + ret i66 %r +} define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> 
%b, <16 x i8> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %r } define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r ; entry: - %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) - ret <16 x i8> %fshr + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) + ret <16 x i8> %r } -declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) - define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> 
splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> ) + ret <8 x i16> %r } define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r ; entry: - %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) - ret <8 x i16> %fshr + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) + ret <8 x i16> 
%r } -declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) - define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> ) + ret <4 x i32> %r } define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <4 x i32> 
@llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r ; entry: - %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) - ret <4 x i32> %fshr + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) + ret <4 x i32> %r } -declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) - define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same' -; CHECK-NEXT: Cost Model: Found costs of 2 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of 6 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 6 for: 
%r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> ) + ret <2 x i64> %r } define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of 7 for: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %fshr +; CHECK-NEXT: Cost Model: Found costs of 7 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r ; entry: - %fshr = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) - ret <2 x i64> %fshr + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) + ret <2 x i64> %r } -declare <2 x i64> @llvm.fshr.v4i64(<2 x i64>, <2 x i64>, <2 x i64>) - define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:10 Lat:10 SizeLat:10 for: %r = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i30> %r ; entry: - %fshr = tail call <4 x 
i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) - ret <4 x i30> %fshr + %r = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) + ret <4 x i30> %r } -declare <4 x i30> @llvm.fshr.v4i30(<4 x i30>, <4 x i30>, <4 x i30>) - define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r ; entry: - %fshr = tail call <2 x i66> @llvm.fshr.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) - ret <2 x i66> %fshr + %r = tail call <2 x i66> @llvm.fshr.v4i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) + ret <2 x i66> %r } -declare <2 x i66> @llvm.fshr.v4i66(<2 x i66>, <2 x i66>, <2 x i66>) -define i66 @fshr_i66(i66 %a, i66 %b) { -; CHECK-LABEL: 'fshr_i66' -; CHECK-NEXT: Cost Model: Found costs of 3 for: %fshr = tail call i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i66 %fshr +define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) { +; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r ; entry: - %fshr = tail call 
i66 @llvm.fshr.i66(i66 %a, i66 %b, i66 9) - ret i66 %fshr + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r } -declare i66 @llvm.fshr.i66(i66, i66, i66) - define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %fshr +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @fshr_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { +; CHECK-LABEL: 'fshr_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:17 Lat:21 SizeLat:21 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) + ret <2 x i128> %r +} + + +; Rotate tests + +define i8 @rotl_i8_3rd_arg_const(i8 %a) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 9) + ret i8 %r +} + +define i8 @rotl_i8_3rd_arg_var(i8 
%a, i8 %c) { +; CHECK-LABEL: 'rotl_i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %r +; +entry: + %r = tail call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 %c) + ret i8 %r +} + +define i16 @rotl_i16_3rd_arg_const(i16 %a) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 9) + ret i16 %r +} + +define i16 @rotl_i16_3rd_arg_var(i16 %a, i16 %c) { +; CHECK-LABEL: 'rotl_i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %r +; +entry: + %r = tail call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 %c) + ret i16 %r +} + +define i32 @rotl_i32_3rd_arg_const(i32 %a) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9) + ret i32 %r +} + +define i32 @rotl_i32_3rd_arg_var(i32 %a, i32 %c) { +; CHECK-LABEL: 'rotl_i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r +; +entry: + %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %c) + ret i32 %r +} + +define i64 @rotl_i64_3rd_arg_const(i64 %a) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = tail call i64 
@llvm.fshr.i64(i64 %a, i64 %a, i64 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r +; +entry: + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9) + ret i64 %r +} + +define i64 @rotl_i64_3rd_arg_var(i64 %a, i64 %c) { +; CHECK-LABEL: 'rotl_i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r ; entry: - %fshr = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) - ret <2 x i128> %fshr + %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %c) + ret i64 %r } -declare <2 x i128> @llvm.fshr.v4i128(<2 x i128>, <2 x i128>, <2 x i128>) -define i128 @fshr_i128(i128 %a, i128 %b) { -; CHECK-LABEL: 'fshr_i128' -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) -; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %fshr +define i128 @rotl_i128_3rd_arg_const(i128 %a) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_const' +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r ; entry: - %fshr = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9) - ret i128 %fshr + %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9) + ret i128 %r } -declare i128 @llvm.fshr.i128(i128, i128, i128) +define i128 @rotl_i128_3rd_arg_var(i128 %a, i128 %c) { +; CHECK-LABEL: 'rotl_i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r +; +entry: + 
%r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %c) + ret i128 %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> splat (i8 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %r +} + +define <16 x i8> @rotl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %c) { +; CHECK-LABEL: 'rotl_v16i8_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r +; +entry: + %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> %c) + ret <16 x i8> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> splat (i16 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x 
i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> ) + ret <8 x i16> %r +} + +define <8 x i16> @rotl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %c) { +; CHECK-LABEL: 'rotl_v8i16_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r +; +entry: + %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> %c) + ret <8 x i16> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> splat (i32 3)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + +define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> ) + ret <4 x i32> %r +} + 
+define <4 x i32> @rotl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %c) { +; CHECK-LABEL: 'rotl_v4i32_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r +; +entry: + %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> %c) + ret <4 x i32> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of 2 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> splat (i64 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) + ret <2 x i64> %r +} + +define <2 x i64> @rotl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %c) { +; CHECK-LABEL: 'rotl_v2i64_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of 5 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r +; +entry: + %r = tail call <2 x i64> @llvm.fshr.v4i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %c) + ret <2 x i64> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x 
i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_all_lanes_same' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1)) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_lanes_different' +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> ) + ret <2 x i128> %r +} + +define <2 x i128> @rotl_v2i128_3rd_arg_var(<2 x i128> %a, <2 x i128> %c) { +; CHECK-LABEL: 'rotl_v2i128_3rd_arg_var' +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:5 Lat:5 SizeLat:5 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r +; +entry: + %r = tail call <2 x i128> @llvm.fshr.v4i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> %c) + ret <2 x i128> %r +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll index 4361e5c113708..27005e7aa175e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform-in-vgpr.ll @@ -1070,9 +1070,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a ; GFX11-True16-NEXT: v_mov_b32_e32 v1, s0 ; 
GFX11-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX11-True16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-True16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11-True16-NEXT: ds_store_b16 v0, v1 ; GFX11-True16-NEXT: s_endpgm ; @@ -1089,10 +1086,6 @@ define amdgpu_ps void @load_divergent_P3_i16(ptr addrspace(3) inreg %ptra, ptr a ; GFX12-True16-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-True16-NEXT: ds_load_u16_d16 v1, v1 ; GFX12-True16-NEXT: s_wait_dscnt 0x0 -; GFX12-True16-NEXT: v_readfirstlane_b32 s0, v1 -; GFX12-True16-NEXT: s_wait_alu 0xf1ff -; GFX12-True16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-True16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX12-True16-NEXT: ds_store_b16 v0, v1 ; GFX12-True16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll index bf36deac33380..9bf140cf744db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-uniform.ll @@ -13,9 +13,6 @@ define amdgpu_ps void @load_uniform_P1_i16_gfx12(ptr addrspace(1) inreg %ptra, p ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_endpgm ; @@ -312,9 +309,6 @@ define amdgpu_ps void @load_uniform_P4_i16_gfx12(ptr addrspace(4) inreg %ptra, p ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_load_d16_b16 v2, v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_readfirstlane_b32 s0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b16_e32 v2.l, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll 
b/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll new file mode 100644 index 0000000000000..2c02987f73928 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/semantics/position.ps.ll @@ -0,0 +1,32 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpDecorate %[[#INPUT:]] BuiltIn FragCoord +; CHECK-DAG: OpDecorate %[[#OUTPUT:]] Location 0 + +; CHECK-DAG: %[[#float:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#v4:]] = OpTypeVector %[[#float]] 4 +; CHECK-DAG: %[[#ptr_i:]] = OpTypePointer Input %[[#v4]] +; CHECK-DAG: %[[#ptr_o:]] = OpTypePointer Output %[[#v4]] + +; CHECK-DAG: %[[#INPUT]] = OpVariable %[[#ptr_i]] Input +; CHECK-DAG: %[[#OUTPUT]] = OpVariable %[[#ptr_o]] Output + +@SV_Position = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +@A0 = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations !2 + +define void @main() #1 { +entry: + %0 = load <4 x float>, ptr addrspace(7) @SV_Position, align 16 + store <4 x float> %0, ptr addrspace(8) @A0, align 16 + ret void + +; CHECK: %[[#TMP:]] = OpLoad %[[#v4]] %[[#INPUT]] Aligned 16 +; CHECK: OpStore %[[#OUTPUT]] %[[#TMP]] Aligned 16 +} + +!0 = !{!1} +!1 = !{i32 11, i32 15} +!2 = !{!3} +!3 = !{i32 30, i32 0} + diff --git a/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll b/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll new file mode 100644 index 0000000000000..73165f3719a97 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/semantics/position.vs.ll @@ -0,0 +1,31 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpDecorate %[[#INPUT:]] Location 0 +; CHECK-DAG: OpDecorate %[[#OUTPUT:]] BuiltIn Position + +; CHECK-DAG: %[[#float:]] = 
OpTypeFloat 32 +; CHECK-DAG: %[[#v4:]] = OpTypeVector %[[#float]] 4 +; CHECK-DAG: %[[#ptr_i:]] = OpTypePointer Input %[[#v4]] +; CHECK-DAG: %[[#ptr_o:]] = OpTypePointer Output %[[#v4]] + +; CHECK-DAG: %[[#INPUT]] = OpVariable %[[#ptr_i]] Input +; CHECK-DAG: %[[#OUTPUT]] = OpVariable %[[#ptr_o]] Output + +@SV_Position0 = external hidden thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0 +@SV_Position = external hidden thread_local addrspace(8) global <4 x float>, !spirv.Decorations !2 + +define void @main() #1 { +entry: + %0 = load <4 x float>, ptr addrspace(7) @SV_Position0, align 16 + store <4 x float> %0, ptr addrspace(8) @SV_Position, align 16 + ret void + +; CHECK: %[[#TMP:]] = OpLoad %[[#v4]] %[[#INPUT]] Aligned 16 +; CHECK: OpStore %[[#OUTPUT]] %[[#TMP]] Aligned 16 +} + +!0 = !{!1} +!1 = !{i32 30, i32 0} +!2 = !{!3} +!3 = !{i32 11, i32 0} diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll index 2b34739fa80e3..6f31aef9aee98 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll @@ -1,17 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) { ; CHECK-LABEL: map0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %r16 -; CHECK-NEXT: movq %rdi, %r17 +; CHECK-NEXT: movq %rsi, %r16 # encoding: [0xd5,0x18,0x89,0xf0] +; 
CHECK-NEXT: movq %rdi, %r17 # encoding: [0xd5,0x18,0x89,0xf9] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl (%r17,%r16,4), %eax -; CHECK-NEXT: retq +; CHECK-NEXT: movl (%r17,%r16,4), %eax # encoding: [0xd5,0x30,0x8b,0x04,0x81] +; CHECK-NEXT: retq # encoding: [0xc3] entry: %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() @@ -22,23 +22,23 @@ entry: define i32 @map1_or_vex(<2 x double> noundef %a) nounwind { ; SSE-LABEL: map1_or_vex: ; SSE: # %bb.0: # %entry -; SSE-NEXT: cvtsd2si %xmm0, %r16d +; SSE-NEXT: cvtsd2si %xmm0, %r16d # encoding: [0xf2,0xd5,0xc0,0x2d,0xc0] ; SSE-NEXT: #APP -; SSE-NEXT: nop +; SSE-NEXT: nop # encoding: [0x90] ; SSE-NEXT: #NO_APP -; SSE-NEXT: movl %r16d, %eax -; SSE-NEXT: retq +; SSE-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; SSE-NEXT: retq # encoding: [0xc3] ; ; AVX-LABEL: map1_or_vex: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vcvtsd2si %xmm0, %ebx +; AVX-NEXT: pushq %rbx # encoding: [0x53] +; AVX-NEXT: vcvtsd2si %xmm0, %ebx # encoding: [0xc5,0xfb,0x2d,0xd8] ; AVX-NEXT: #APP -; AVX-NEXT: nop +; AVX-NEXT: nop # encoding: [0x90] ; AVX-NEXT: #NO_APP -; AVX-NEXT: movl %ebx, %eax -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; AVX-NEXT: movl %ebx, %eax # encoding: [0x89,0xd8] +; AVX-NEXT: popq %rbx # encoding: [0x5b] +; AVX-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a) tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() @@ -48,31 +48,31 @@ entry: define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind { ; SSE-LABEL: map2_or_vex: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq %rsi, %rbx -; SSE-NEXT: movq %rdi, %r14 +; SSE-NEXT: pushq %r14 # encoding: [0x41,0x56] +; 
SSE-NEXT: pushq %rbx # encoding: [0x53] +; SSE-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; SSE-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; SSE-NEXT: #APP -; SSE-NEXT: nop +; SSE-NEXT: nop # encoding: [0x90] ; SSE-NEXT: #NO_APP -; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: retq +; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 # encoding: [0x66,0x41,0x0f,0x38,0x1c,0x04,0x9e] +; SSE-NEXT: popq %rbx # encoding: [0x5b] +; SSE-NEXT: popq %r14 # encoding: [0x41,0x5e] +; SSE-NEXT: retq # encoding: [0xc3] ; ; AVX-LABEL: map2_or_vex: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rsi, %rbx -; AVX-NEXT: movq %rdi, %r14 +; AVX-NEXT: pushq %r14 # encoding: [0x41,0x56] +; AVX-NEXT: pushq %rbx # encoding: [0x53] +; AVX-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; AVX-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; AVX-NEXT: #APP -; AVX-NEXT: nop +; AVX-NEXT: nop # encoding: [0x90] ; AVX-NEXT: #NO_APP -; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: retq +; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 # encoding: [0xc4,0xc2,0x79,0x1c,0x04,0x9e] +; AVX-NEXT: popq %rbx # encoding: [0x5b] +; AVX-NEXT: popq %r14 # encoding: [0x41,0x5e] +; AVX-NEXT: retq # encoding: [0xc3] entry: tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll index c193680607f76..a6ab98f8bf03e 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown 
-mattr=+amx-tile,+egpr --show-mc-encoding | FileCheck %s define dso_local void @amx(ptr noundef %data) nounwind { ; CHECK-LABEL: amx: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl $8, %eax -; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00] +; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x24,0x03] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] entry: tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8) diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll index 4692a58d095a6..e7bc0c362cad3 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr --show-mc-encoding | FileCheck %s define void @x87(ptr %0, ptr %1) nounwind { ; CHECK-LABEL: x87: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: pushq %r14 # encoding: [0x41,0x56] +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3] +; CHECK-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] 
; CHECK-NEXT: #NO_APP -; CHECK-NEXT: flds (%r14) -; CHECK-NEXT: fstps (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: retq +; CHECK-NEXT: flds (%r14) # encoding: [0x41,0xd9,0x06] +; CHECK-NEXT: fstps (%rbx) # encoding: [0xd9,0x1b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: popq %r14 # encoding: [0x41,0x5e] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() %3 = load float, ptr %0 store float %3, ptr %1 diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll index f2025b5c8cbf8..9b89bce283b15 100644 --- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll +++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr --show-mc-encoding | FileCheck %s define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xsave: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xsave (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xsave (%rbx) # encoding: [0x0f,0xae,0x23] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", 
"~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -24,17 +24,17 @@ declare void @llvm.x86.xsave(ptr, i32, i32) define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xsave64: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xsave64 (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xsave64 (%rbx) # encoding: [0x48,0x0f,0xae,0x23] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -44,17 +44,17 @@ declare void @llvm.x86.xsave64(ptr, i32, i32) define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xrstor: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xrstor (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: 
[0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xrstor (%rbx) # encoding: [0x0f,0xae,0x2b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo) ret void; @@ -64,17 +64,17 @@ declare void @llvm.x86.xrstor(ptr, i32, i32) define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind { ; CHECK-LABEL: test_xrstor64: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r16d -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: pushq %rbx # encoding: [0x53] +; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0] +; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2] +; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] ; CHECK-NEXT: #APP -; CHECK-NEXT: nop +; CHECK-NEXT: nop # encoding: [0x90] ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movl %r16d, %eax -; CHECK-NEXT: xrstor64 (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: retq +; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0] +; CHECK-NEXT: xrstor64 (%rbx) # encoding: [0x48,0x0f,0xae,0x2b] +; CHECK-NEXT: popq %rbx # encoding: [0x5b] +; CHECK-NEXT: retq # encoding: [0xc3] tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"() call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo) ret void; diff --git a/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir new file mode 100644 index 0000000000000..0d8f2177aaa30 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir @@ -0,0 +1,26 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +# RUN: llc %s -mtriple=i386-- -start-before=postrapseudos -o - | FileCheck %s + +--- | + target triple = "i386-unknown-unknown" + + define void @setallones() #0 { + ; CHECK-LABEL: setallones: 
+ ; CHECK: # %bb.0: + ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 + ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 + entry: + unreachable + } + + attributes #0 = { "target-features"="+avx512f,+avx512vl" } +--- +name: setallones +tracksRegLiveness: true +liveins: [] +body: | + bb.0: + $xmm0 = AVX512_128_SETALLONES + $ymm1 = AVX512_256_SETALLONES + +... diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir new file mode 100644 index 0000000000000..7e5ddc4cd632f --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir @@ -0,0 +1,30 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s + +--- | + target triple = "x86_64-unknown-unknown" + + define void @setallones() #0 { + ; CHECK-LABEL: setallones: + ; CHECK: # %bb.0: + ; CHECK-NEXT: vpcmpeqd %xmm14, %xmm14, %xmm14 + ; CHECK-NEXT: vpternlogd {{.*#+}} xmm16 = -1 + ; CHECK-NEXT: vpcmpeqd %ymm15, %ymm15, %ymm15 + ; CHECK-NEXT: vpternlogd {{.*#+}} ymm17 = -1 + entry: + unreachable + } + + attributes #0 = { "target-features"="+avx512f,+avx512vl" } +--- +name: setallones +tracksRegLiveness: true +liveins: [] +body: | + bb.0: + $xmm14 = AVX512_128_SETALLONES + $xmm16 = AVX512_128_SETALLONES + $ymm15 = AVX512_256_SETALLONES + $ymm17 = AVX512_256_SETALLONES + +... 
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll index 3243d950740ca..e2400fbe2c4ff 100644 --- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll +++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll @@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 32 -; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq use.v4.i32@PLT ; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s index f1438532d7c5e..5b4689b2954df 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s @@ -1,8 +1,195 @@ -// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --unique --version 5 +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -v_bfrev_b32_e32 v5, v1 +// INSTS= +// v_bfrev_b32 OPS32 +// v_ceil_f16 OPS16 +// v_ceil_f32 OPS32 +// v_ceil_f64 OPS64 +// v_cls_i32 OPS32 +// v_clz_i32_u32 OPS32 +// v_cos_f16 OPS16 +// v_cos_f32 OPS32 +// v_ctz_i32_b32 OPS32 +// v_cvt_f16_f32 v5.l, SRC32 +// v_cvt_f16_f32 v127.h, 0xaf123456 +// v_cvt_f16_f32 v127.l, 0.5 +// v_cvt_f16_i16 OPS16 +// v_cvt_f16_u16 OPS16 +// v_cvt_f32_f16 OPS_32_16 +// v_cvt_f32_f64 OPS_32_64 +// v_cvt_f32_i32 OPS32 +// v_cvt_f32_u32 OPS32 +// v_cvt_f32_ubyte0 OPS32 +// v_cvt_f32_ubyte1 OPS32 +// v_cvt_f32_ubyte2 OPS32 
+// v_cvt_f32_ubyte3 OPS32 +// v_cvt_f64_f32 OPS_64_32 +// v_cvt_f64_i32 OPS_64_32 +// v_cvt_f64_u32 OPS_64_32 +// v_cvt_floor_i32_f32 OPS32 +// v_cvt_flr_i32_f32 OPS32 +// v_cvt_i16_f16 OPS16 +// v_cvt_i32_f32 OPS32 +// v_cvt_i32_f64 OPS_32_64 +// v_cvt_i32_i16 OPS_32_16 +// v_cvt_nearest_i32_f32 OPS32 +// v_cvt_norm_i16_f16 OPS16 +// v_cvt_norm_u16_f16 OPS16 +// v_cvt_off_f32_i4 v5, SRC32 +// v_cvt_off_f32_i4 v255, 0x4f +// v_cvt_rpi_i32_f32 OPS32 +// v_cvt_u16_f16 OPS16 +// v_cvt_u32_f32 OPS32 +// v_cvt_u32_f64 OPS_32_64 +// v_cvt_u32_u16 OPS_32_16 +// v_exp_f16 OPS16 +// v_exp_f32 OPS32 +// v_ffbh_i32 OPS32 +// v_ffbh_u32 OPS32 +// v_ffbl_b32 OPS32 +// v_floor_f16 OPS16 +// v_floor_f32 OPS32 +// v_floor_f64 OPS64 +// v_fract_f16 OPS16 +// v_fract_f32 OPS32 +// v_fract_f64 OPS64 +// v_frexp_exp_i16_f16 OPS16 +// v_frexp_exp_i32_f32 OPS32 +// v_frexp_exp_i32_f64 OPS_32_64 +// v_frexp_mant_f16 OPS16 +// v_frexp_mant_f32 OPS32 +// v_frexp_mant_f64 OPS64 +// v_log_f16 OPS16 +// v_log_f32 OPS32 +// v_mov_b16_e32 OPS16 +// v_mov_b16_e64 OPS16 +// v_mov_b32 OPS32 +// v_movreld_b32 OPS32 +// v_movrels_b32 v5, v1 +// v_movrels_b32 v255, v255 +// v_movrelsd_2_b32 v5, v1 +// v_movrelsd_2_b32 v255, v255 +// v_movrelsd_b32 v5, v1 +// v_movrelsd_b32 v255, v255 +// v_nop +// v_not_b16 OPS16 +// v_not_b32 OPS32 +// v_permlane64_b32 v5, v1 +// v_permlane64_b32 v255, v255 +// v_pipeflush +// v_rcp_f16 OPS16 +// v_rcp_f32 OPS32 +// v_rcp_f64 OPS64 +// v_rcp_iflag_f32 OPS32 +// v_readfirstlane_b32 s5, v1 +// v_readfirstlane_b32 s105, v1 +// v_readfirstlane_b32 vcc_lo, v1 +// v_readfirstlane_b32 vcc_hi, v1 +// v_readfirstlane_b32 ttmp15, v1 +// v_readfirstlane_b32 null, v255 +// v_rndne_f16 OPS16 +// v_rndne_f32 OPS32 +// v_rndne_f64 OPS64 +// v_rsq_f16 OPS16 +// v_rsq_f32 OPS32 +// v_rsq_f64 OPS64 +// v_sat_pk_u8_i16 v5.l, SRC32 +// v_sat_pk_u8_i16 v127.l, 0xfe0b +// v_sat_pk_u8_i16 v127.l, 0.5 +// v_sat_pk_u8_i16 v5.h, src_scc +// v_sat_pk_u8_i16 v127.h, 0xfe0b +// v_sin_f16 OPS16 
+// v_sin_f32 OPS32 +// v_sqrt_f16 OPS16 +// v_sqrt_f32 OPS32 +// v_sqrt_f64 OPS64 +// v_swap_b16 v5.l, v1.h +// v_swap_b16 v5.h, v1.l +// v_swap_b16 v127.l, v127.l +// v_swap_b32 v5, v1 +// v_swap_b32 v255, v255 +// v_swaprel_b32 v5, v1 +// v_swaprel_b32 v255, v255 +// v_trunc_f16 OPS16 +// v_trunc_f32 OPS32 +// v_trunc_f64 OPS64 +// +// SRC16= +// v1.l +// v127.l +// v1.h +// v127.h +// s1 +// s105 +// vcc_lo +// vcc_hi +// ttmp15 +// m0 +// exec_lo +// exec_hi +// null +// -1 +// 0.5 +// src_scc +// +// OPS16= +// v5.l, SRC16 +// v5.l, 0xfe0b +// v5.h, src_scc +// v127.h, 0xfe0b +// +// SRC32= +// v1 +// v255 +// s1 +// s105 +// vcc_lo +// vcc_hi +// ttmp15 +// m0 +// exec_lo +// exec_hi +// null +// -1 +// 0.5 +// src_scc +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 +// +// SRC64= +// v[1:2] +// v[254:255] +// s[2:3] +// s[104:105] +// vcc +// ttmp[14:15] +// exec +// null +// -1 +// 0.5 +// src_scc +// +// OPS64= +// v[5:6], SRC64 +// v[254:255], 0xaf123456 +// +// OPS_32_16= +// v5, SRC16 +// v255, 0xfe0b +// +// OPS_32_64= +// v5, SRC64 +// v255, 0xaf123456 +// +// OPS_64_32= +// v[5:6], SRC32 +// v[254:255], 0xaf123456 + +v_bfrev_b32 v5, v1 // GFX11: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] v_bfrev_b32 v5, v255 @@ -89,8 +276,14 @@ v_ceil_f16 v5.l, null v_ceil_f16 v5.l, -1 // GFX11: v_ceil_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb8,0x0a,0x7e] -v_ceil_f16 v127.l, 0.5 -// GFX11: v_ceil_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb8,0xfe,0x7e] +v_ceil_f16 v5.l, 0.5 +// GFX11: v_ceil_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb8,0x0a,0x7e] + +v_ceil_f16 v5.l, src_scc +// GFX11: v_ceil_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7e] + +v_ceil_f16 v5.l, 0xfe0b +// GFX11: v_ceil_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_ceil_f16 v5.h, src_scc // GFX11: v_ceil_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb8,0x0a,0x7f] @@ -275,6 +468,12 @@ v_cos_f16 v5.l, v1.l v_cos_f16 v5.l, v127.l // GFX11: v_cos_f16_e32 v5.l, v127.l 
; encoding: [0x7f,0xc3,0x0a,0x7e] +v_cos_f16 v5.l, v1.h +// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] + +v_cos_f16 v5.l, v127.h +// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] + v_cos_f16 v5.l, s1 // GFX11: v_cos_f16_e32 v5.l, s1 ; encoding: [0x01,0xc2,0x0a,0x7e] @@ -311,17 +510,8 @@ v_cos_f16 v5.l, 0.5 v_cos_f16 v5.l, src_scc // GFX11: v_cos_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7e] -v_cos_f16 v127.l, 0xfe0b -// GFX11: v_cos_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_cos_f16 v5.l, v1.h -// GFX11: v_cos_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc3,0x0a,0x7e] - -v_cos_f16 v5.l, v127.h -// GFX11: v_cos_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc3,0x0a,0x7e] - -v_cos_f16 v127.l, 0.5 -// GFX11: v_cos_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc2,0xfe,0x7e] +v_cos_f16 v5.l, 0xfe0b +// GFX11: v_cos_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cos_f16 v5.h, src_scc // GFX11: v_cos_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc2,0x0a,0x7f] @@ -458,8 +648,8 @@ v_cvt_f16_f32 v5.l, -1 v_cvt_f16_f32 v5.l, 0.5 // GFX11: v_cvt_f16_f32_e32 v5.l, 0.5 ; encoding: [0xf0,0x14,0x0a,0x7e] -v_cvt_f16_f32 v5.h, src_scc -// GFX11: v_cvt_f16_f32_e32 v5.h, src_scc ; encoding: [0xfd,0x14,0x0a,0x7f] +v_cvt_f16_f32 v5.l, src_scc +// GFX11: v_cvt_f16_f32_e32 v5.l, src_scc ; encoding: [0xfd,0x14,0x0a,0x7e] v_cvt_f16_f32 v127.h, 0xaf123456 // GFX11: v_cvt_f16_f32_e32 v127.h, 0xaf123456 ; encoding: [0xff,0x14,0xfe,0x7f,0x56,0x34,0x12,0xaf] @@ -509,12 +699,15 @@ v_cvt_f16_i16 v5.l, null v_cvt_f16_i16 v5.l, -1 // GFX11: v_cvt_f16_i16_e32 v5.l, -1 ; encoding: [0xc1,0xa2,0x0a,0x7e] -v_cvt_f16_i16 v127.l, 0.5 -// GFX11: v_cvt_f16_i16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa2,0xfe,0x7e] - v_cvt_f16_i16 v5.l, 0.5 // GFX11: v_cvt_f16_i16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa2,0x0a,0x7e] +v_cvt_f16_i16 v5.l, src_scc +// GFX11: v_cvt_f16_i16_e32 v5.l, src_scc ; encoding: 
[0xfd,0xa2,0x0a,0x7e] + +v_cvt_f16_i16 v5.l, 0xfe0b +// GFX11: v_cvt_f16_i16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] + v_cvt_f16_i16 v5.h, src_scc // GFX11: v_cvt_f16_i16_e32 v5.h, src_scc ; encoding: [0xfd,0xa2,0x0a,0x7f] @@ -563,11 +756,14 @@ v_cvt_f16_u16 v5.l, null v_cvt_f16_u16 v5.l, -1 // GFX11: v_cvt_f16_u16_e32 v5.l, -1 ; encoding: [0xc1,0xa0,0x0a,0x7e] -v_cvt_f16_u16 v127.l, 0.5 -// GFX11: v_cvt_f16_u16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa0,0xfe,0x7e] +v_cvt_f16_u16 v5.l, 0.5 +// GFX11: v_cvt_f16_u16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa0,0x0a,0x7e] + +v_cvt_f16_u16 v5.l, src_scc +// GFX11: v_cvt_f16_u16_e32 v5.l, src_scc ; encoding: [0xfd,0xa0,0x0a,0x7e] -v_cvt_f16_u16 v5, 0.5 -// GFX11: v_cvt_f16_u16_e32 v5, 0.5 ; encoding: [0xf0,0xa0,0x0a,0x7e] +v_cvt_f16_u16 v5.l, 0xfe0b +// GFX11: v_cvt_f16_u16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_f16_u16 v5.h, src_scc // GFX11: v_cvt_f16_u16_e32 v5.h, src_scc ; encoding: [0xfd,0xa0,0x0a,0x7f] @@ -1199,8 +1395,14 @@ v_cvt_i16_f16 v5.l, null v_cvt_i16_f16 v5.l, -1 // GFX11: v_cvt_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa6,0x0a,0x7e] -v_cvt_i16_f16 v127.l, 0.5 -// GFX11: v_cvt_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa6,0xfe,0x7e] +v_cvt_i16_f16 v5.l, 0.5 +// GFX11: v_cvt_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa6,0x0a,0x7e] + +v_cvt_i16_f16 v5.l, src_scc +// GFX11: v_cvt_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa6,0x0a,0x7e] + +v_cvt_i16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_i16_f16 v5.h, src_scc // GFX11: v_cvt_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa6,0x0a,0x7f] @@ -1295,6 +1497,12 @@ v_cvt_i32_i16 v5, v1.l v_cvt_i32_i16 v5, v127.l // GFX11: v_cvt_i32_i16_e32 v5, v127.l ; encoding: [0x7f,0xd5,0x0a,0x7e] +v_cvt_i32_i16 v5, v1.h +// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] + +v_cvt_i32_i16 v5, v127.h +// GFX11: 
v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] + v_cvt_i32_i16 v5, s1 // GFX11: v_cvt_i32_i16_e32 v5, s1 ; encoding: [0x01,0xd4,0x0a,0x7e] @@ -1334,12 +1542,6 @@ v_cvt_i32_i16 v5, src_scc v_cvt_i32_i16 v255, 0xfe0b // GFX11: v_cvt_i32_i16_e32 v255, 0xfe0b ; encoding: [0xff,0xd4,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_cvt_i32_i16 v5, v1.h -// GFX11: v_cvt_i32_i16_e32 v5, v1.h ; encoding: [0x81,0xd5,0x0a,0x7e] - -v_cvt_i32_i16 v5, v127.h -// GFX11: v_cvt_i32_i16_e32 v5, v127.h ; encoding: [0xff,0xd5,0x0a,0x7e] - v_cvt_nearest_i32_f32 v5, v1 // GFX11: v_cvt_nearest_i32_f32_e32 v5, v1 ; encoding: [0x01,0x19,0x0a,0x7e] @@ -1427,8 +1629,14 @@ v_cvt_norm_i16_f16 v5.l, null v_cvt_norm_i16_f16 v5.l, -1 // GFX11: v_cvt_norm_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc6,0x0a,0x7e] -v_cvt_norm_i16_f16 v127.l, 0.5 -// GFX11: v_cvt_norm_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc6,0xfe,0x7e] +v_cvt_norm_i16_f16 v5.l, 0.5 +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc6,0x0a,0x7e] + +v_cvt_norm_i16_f16 v5.l, src_scc +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc6,0x0a,0x7e] + +v_cvt_norm_i16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_norm_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_norm_i16_f16 v5.h, src_scc // GFX11: v_cvt_norm_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc6,0x0a,0x7f] @@ -1478,8 +1686,14 @@ v_cvt_norm_u16_f16 v5.l, null v_cvt_norm_u16_f16 v5.l, -1 // GFX11: v_cvt_norm_u16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xc8,0x0a,0x7e] -v_cvt_norm_u16_f16 v127.l, 0.5 -// GFX11: v_cvt_norm_u16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc8,0xfe,0x7e] +v_cvt_norm_u16_f16 v5.l, 0.5 +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xc8,0x0a,0x7e] + +v_cvt_norm_u16_f16 v5.l, src_scc +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc8,0x0a,0x7e] + +v_cvt_norm_u16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_norm_u16_f16_e32 v5.l, 0xfe0b ; encoding: 
[0xff,0xc8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_norm_u16_f16 v5.h, src_scc // GFX11: v_cvt_norm_u16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc8,0x0a,0x7f] @@ -1619,8 +1833,14 @@ v_cvt_u16_f16 v5.l, null v_cvt_u16_f16 v5.l, -1 // GFX11: v_cvt_u16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa4,0x0a,0x7e] -v_cvt_u16_f16 v127.l, 0.5 -// GFX11: v_cvt_u16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa4,0xfe,0x7e] +v_cvt_u16_f16 v5.l, 0.5 +// GFX11: v_cvt_u16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa4,0x0a,0x7e] + +v_cvt_u16_f16 v5.l, src_scc +// GFX11: v_cvt_u16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa4,0x0a,0x7e] + +v_cvt_u16_f16 v5.l, 0xfe0b +// GFX11: v_cvt_u16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa4,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_cvt_u16_f16 v5.h, src_scc // GFX11: v_cvt_u16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa4,0x0a,0x7f] @@ -1715,6 +1935,12 @@ v_cvt_u32_u16 v5, v1.l v_cvt_u32_u16 v5, v127.l // GFX11: v_cvt_u32_u16_e32 v5, v127.l ; encoding: [0x7f,0xd7,0x0a,0x7e] +v_cvt_u32_u16 v5, v1.h +// GFX11: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e] + +v_cvt_u32_u16 v5, v127.h +// GFX11: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e] + v_cvt_u32_u16 v5, s1 // GFX11: v_cvt_u32_u16_e32 v5, s1 ; encoding: [0x01,0xd6,0x0a,0x7e] @@ -1754,12 +1980,6 @@ v_cvt_u32_u16 v5, src_scc v_cvt_u32_u16 v255, 0xfe0b // GFX11: v_cvt_u32_u16_e32 v255, 0xfe0b ; encoding: [0xff,0xd6,0xfe,0x7f,0x0b,0xfe,0x00,0x00] -v_cvt_u32_u16 v5, v1.h -// GFX11: v_cvt_u32_u16_e32 v5, v1.h ; encoding: [0x81,0xd7,0x0a,0x7e] - -v_cvt_u32_u16 v5, v127.h -// GFX11: v_cvt_u32_u16_e32 v5, v127.h ; encoding: [0xff,0xd7,0x0a,0x7e] - v_exp_f16 v5.l, v1.l // GFX11: v_exp_f16_e32 v5.l, v1.l ; encoding: [0x01,0xb1,0x0a,0x7e] @@ -1802,8 +2022,14 @@ v_exp_f16 v5.l, null v_exp_f16 v5.l, -1 // GFX11: v_exp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb0,0x0a,0x7e] -v_exp_f16 v127.l, 0.5 -// GFX11: v_exp_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb0,0xfe,0x7e] +v_exp_f16 v5.l, 0.5 +// GFX11: 
v_exp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb0,0x0a,0x7e] + +v_exp_f16 v5.l, src_scc +// GFX11: v_exp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7e] + +v_exp_f16 v5.l, 0xfe0b +// GFX11: v_exp_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_exp_f16 v5.h, src_scc // GFX11: v_exp_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb0,0x0a,0x7f] @@ -2033,8 +2259,14 @@ v_floor_f16 v5.l, null v_floor_f16 v5.l, -1 // GFX11: v_floor_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb6,0x0a,0x7e] -v_floor_f16 v127.l, 0.5 -// GFX11: v_floor_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb6,0xfe,0x7e] +v_floor_f16 v5.l, 0.5 +// GFX11: v_floor_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb6,0x0a,0x7e] + +v_floor_f16 v5.l, src_scc +// GFX11: v_floor_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7e] + +v_floor_f16 v5.l, 0xfe0b +// GFX11: v_floor_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb6,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_floor_f16 v5.h, src_scc // GFX11: v_floor_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb6,0x0a,0x7f] @@ -2129,6 +2361,12 @@ v_fract_f16 v5.l, v1.l v_fract_f16 v5.l, v127.l // GFX11: v_fract_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbf,0x0a,0x7e] +v_fract_f16 v5.l, v1.h +// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] + +v_fract_f16 v5.l, v127.h +// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] + v_fract_f16 v5.l, s1 // GFX11: v_fract_f16_e32 v5.l, s1 ; encoding: [0x01,0xbe,0x0a,0x7e] @@ -2165,17 +2403,8 @@ v_fract_f16 v5.l, 0.5 v_fract_f16 v5.l, src_scc // GFX11: v_fract_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7e] -v_fract_f16 v127.l, 0xfe0b -// GFX11: v_fract_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbe,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_fract_f16 v5.l, v1.h -// GFX11: v_fract_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbf,0x0a,0x7e] - -v_fract_f16 v5.l, v127.h -// GFX11: v_fract_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbf,0x0a,0x7e] - -v_fract_f16 v127.l, 0.5 -// GFX11: v_fract_f16_e32 v127.l, 0.5 ; 
encoding: [0xf0,0xbe,0xfe,0x7e] +v_fract_f16 v5.l, 0xfe0b +// GFX11: v_fract_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xbe,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_fract_f16 v5.h, src_scc // GFX11: v_fract_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbe,0x0a,0x7f] @@ -2306,8 +2535,14 @@ v_frexp_exp_i16_f16 v5.l, null v_frexp_exp_i16_f16 v5.l, -1 // GFX11: v_frexp_exp_i16_f16_e32 v5.l, -1 ; encoding: [0xc1,0xb4,0x0a,0x7e] -v_frexp_exp_i16_f16 v127.l, 0.5 -// GFX11: v_frexp_exp_i16_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb4,0xfe,0x7e] +v_frexp_exp_i16_f16 v5.l, 0.5 +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xb4,0x0a,0x7e] + +v_frexp_exp_i16_f16 v5.l, src_scc +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb4,0x0a,0x7e] + +v_frexp_exp_i16_f16 v5.l, 0xfe0b +// GFX11: v_frexp_exp_i16_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb4,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_frexp_exp_i16_f16 v5.h, src_scc // GFX11: v_frexp_exp_i16_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb4,0x0a,0x7f] @@ -2402,6 +2637,12 @@ v_frexp_mant_f16 v5.l, v1.l v_frexp_mant_f16 v5.l, v127.l // GFX11: v_frexp_mant_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xb3,0x0a,0x7e] +v_frexp_mant_f16 v5.l, v1.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] + +v_frexp_mant_f16 v5.l, v127.h +// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: [0xff,0xb3,0x0a,0x7e] + v_frexp_mant_f16 v5.l, s1 // GFX11: v_frexp_mant_f16_e32 v5.l, s1 ; encoding: [0x01,0xb2,0x0a,0x7e] @@ -2438,17 +2679,8 @@ v_frexp_mant_f16 v5.l, 0.5 v_frexp_mant_f16 v5.l, src_scc // GFX11: v_frexp_mant_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7e] -v_frexp_mant_f16 v127.l, 0xfe0b -// GFX11: v_frexp_mant_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xb2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_frexp_mant_f16 v5.l, v1.h -// GFX11: v_frexp_mant_f16_e32 v5.l, v1.h ; encoding: [0x81,0xb3,0x0a,0x7e] - -v_frexp_mant_f16 v5.l, v127.h -// GFX11: v_frexp_mant_f16_e32 v5.l, v127.h ; encoding: 
[0xff,0xb3,0x0a,0x7e] - -v_frexp_mant_f16 v127.l, 0.5 -// GFX11: v_frexp_mant_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xb2,0xfe,0x7e] +v_frexp_mant_f16 v5.l, 0xfe0b +// GFX11: v_frexp_mant_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xb2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_frexp_mant_f16 v5.h, src_scc // GFX11: v_frexp_mant_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xb2,0x0a,0x7f] @@ -2579,8 +2811,14 @@ v_log_f16 v5.l, null v_log_f16 v5.l, -1 // GFX11: v_log_f16_e32 v5.l, -1 ; encoding: [0xc1,0xae,0x0a,0x7e] -v_log_f16 v127.l, 0.5 -// GFX11: v_log_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xae,0xfe,0x7e] +v_log_f16 v5.l, 0.5 +// GFX11: v_log_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xae,0x0a,0x7e] + +v_log_f16 v5.l, src_scc +// GFX11: v_log_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xae,0x0a,0x7e] + +v_log_f16 v5.l, 0xfe0b +// GFX11: v_log_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xae,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_log_f16 v5.h, src_scc // GFX11: v_log_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xae,0x0a,0x7f] @@ -2633,35 +2871,119 @@ v_log_f32 v5, src_scc v_log_f32 v255, 0xaf123456 // GFX11: v_log_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x4e,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_mov_b16_e32 v0.l, v1.l -// GFX11: v_mov_b16_e32 v0.l, v1.l ; encoding: [0x01,0x39,0x00,0x7e] +v_mov_b16_e32 v5.l, v1.l +// GFX11: v_mov_b16_e32 v5.l, v1.l ; encoding: [0x01,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v127.l +// GFX11: v_mov_b16_e32 v5.l, v127.l ; encoding: [0x7f,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v1.h +// GFX11: v_mov_b16_e32 v5.l, v1.h ; encoding: [0x81,0x39,0x0a,0x7e] + +v_mov_b16_e32 v5.l, v127.h +// GFX11: v_mov_b16_e32 v5.l, v127.h ; encoding: [0xff,0x39,0x0a,0x7e] -v_mov_b16_e32 v0.l, s1 -// GFX11: v_mov_b16_e32 v0.l, s1 ; encoding: [0x01,0x38,0x00,0x7e] +v_mov_b16_e32 v5.l, s1 +// GFX11: v_mov_b16_e32 v5.l, s1 ; encoding: [0x01,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.h, 0 -// GFX11: v_mov_b16_e32 v0.h, 0 ; encoding: [0x80,0x38,0x00,0x7f] +v_mov_b16_e32 v5.l, s105 +// GFX11: v_mov_b16_e32 v5.l, s105 
; encoding: [0x69,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.h, 1.0 -// GFX11: v_mov_b16_e32 v0.h, 1.0 ; encoding: [0xf2,0x38,0x00,0x7f] +v_mov_b16_e32 v5.l, vcc_lo +// GFX11: v_mov_b16_e32 v5.l, vcc_lo ; encoding: [0x6a,0x38,0x0a,0x7e] -v_mov_b16_e32 v0.l, 0x1234 -// GFX11: v_mov_b16_e32 v0.l, 0x1234 ; encoding: [0xff,0x38,0x00,0x7e,0x34,0x12,0x00,0x00] +v_mov_b16_e32 v5.l, vcc_hi +// GFX11: v_mov_b16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, v1.l -// GFX11: v_mov_b16_e64 v0.l, v1.l ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] +v_mov_b16_e32 v5.l, ttmp15 +// GFX11: v_mov_b16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x38,0x0a,0x7e] -v_mov_b16_e64 v200.l, v1.h -// GFX11: v_mov_b16_e64 v200.l, v1.h op_sel:[1,0] ; encoding: [0xc8,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] +v_mov_b16_e32 v5.l, m0 +// GFX11: v_mov_b16_e32 v5.l, m0 ; encoding: [0x7d,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, s1 -// GFX11: v_mov_b16_e64 v0.l, s1 ; encoding: [0x00,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] +v_mov_b16_e32 v5.l, exec_lo +// GFX11: v_mov_b16_e32 v5.l, exec_lo ; encoding: [0x7e,0x38,0x0a,0x7e] -v_mov_b16_e64 v200.h, 1 -// GFX11: v_mov_b16_e64 v200.h, 1 op_sel:[0,1] ; encoding: [0xc8,0x40,0x9c,0xd5,0x81,0x00,0x00,0x00] +v_mov_b16_e32 v5.l, exec_hi +// GFX11: v_mov_b16_e32 v5.l, exec_hi ; encoding: [0x7f,0x38,0x0a,0x7e] -v_mov_b16_e64 v0.l, 0x1234 -// GFX11: v_mov_b16_e64 v0.l, 0x1234 ; encoding: [0x00,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +v_mov_b16_e32 v5.l, null +// GFX11: v_mov_b16_e32 v5.l, null ; encoding: [0x7c,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, -1 +// GFX11: v_mov_b16_e32 v5.l, -1 ; encoding: [0xc1,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, 0.5 +// GFX11: v_mov_b16_e32 v5.l, 0.5 ; encoding: [0xf0,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, src_scc +// GFX11: v_mov_b16_e32 v5.l, src_scc ; encoding: [0xfd,0x38,0x0a,0x7e] + +v_mov_b16_e32 v5.l, 0xfe0b +// GFX11: v_mov_b16_e32 v5.l, 0xfe0b ; encoding: [0xff,0x38,0x0a,0x7e,0x0b,0xfe,0x00,0x00] + 
+v_mov_b16_e32 v5.h, src_scc +// GFX11: v_mov_b16_e32 v5.h, src_scc ; encoding: [0xfd,0x38,0x0a,0x7f] + +v_mov_b16_e32 v127.h, 0xfe0b +// GFX11: v_mov_b16_e32 v127.h, 0xfe0b ; encoding: [0xff,0x38,0xfe,0x7f,0x0b,0xfe,0x00,0x00] + +v_mov_b16_e64 v5.l, v1.l +// GFX11: v_mov_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v127.l +// GFX11: v_mov_b16_e64 v5.l, v127.l ; encoding: [0x05,0x00,0x9c,0xd5,0x7f,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v1.h +// GFX11: v_mov_b16_e64 v5.l, v1.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9c,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, v127.h +// GFX11: v_mov_b16_e64 v5.l, v127.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9c,0xd5,0x7f,0x01,0x00,0x00] + +v_mov_b16_e64 v5.l, s1 +// GFX11: v_mov_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0x9c,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, s105 +// GFX11: v_mov_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0x9c,0xd5,0x69,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, vcc_lo +// GFX11: v_mov_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x9c,0xd5,0x6a,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, vcc_hi +// GFX11: v_mov_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x9c,0xd5,0x6b,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, ttmp15 +// GFX11: v_mov_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x9c,0xd5,0x7b,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, m0 +// GFX11: v_mov_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0x9c,0xd5,0x7d,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, exec_lo +// GFX11: v_mov_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x9c,0xd5,0x7e,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, exec_hi +// GFX11: v_mov_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x9c,0xd5,0x7f,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, null +// GFX11: v_mov_b16_e64 v5.l, null ; encoding: [0x05,0x00,0x9c,0xd5,0x7c,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, -1 +// GFX11: v_mov_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0x9c,0xd5,0xc1,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, 0.5 +// GFX11: v_mov_b16_e64 v5.l, 0.5 ; encoding: 
[0x05,0x00,0x9c,0xd5,0xf0,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, src_scc +// GFX11: v_mov_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0x9c,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b16_e64 v5.l, 0xfe0b +// GFX11: v_mov_b16_e64 v5.l, 0xfe0b ; encoding: [0x05,0x00,0x9c,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_mov_b16_e64 v5.h, src_scc +// GFX11: v_mov_b16_e64 v5.h, src_scc op_sel:[0,1] ; encoding: [0x05,0x40,0x9c,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b16_e64 v127.h, 0xfe0b +// GFX11: v_mov_b16_e64 v127.h, 0xfe0b op_sel:[0,1] ; encoding: [0x7f,0x40,0x9c,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_mov_b32 v5, v1 // GFX11: v_mov_b32_e32 v5, v1 ; encoding: [0x01,0x03,0x0a,0x7e] @@ -2780,6 +3102,12 @@ v_not_b16 v5.l, v1.l v_not_b16 v5.l, v127.l // GFX11: v_not_b16_e32 v5.l, v127.l ; encoding: [0x7f,0xd3,0x0a,0x7e] +v_not_b16 v5.l, v1.h +// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] + +v_not_b16 v5.l, v127.h +// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] + v_not_b16 v5.l, s1 // GFX11: v_not_b16_e32 v5.l, s1 ; encoding: [0x01,0xd2,0x0a,0x7e] @@ -2816,17 +3144,8 @@ v_not_b16 v5.l, 0.5 v_not_b16 v5.l, src_scc // GFX11: v_not_b16_e32 v5.l, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7e] -v_not_b16 v127.l, 0xfe0b -// GFX11: v_not_b16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xd2,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_not_b16 v5.l, v1.h -// GFX11: v_not_b16_e32 v5.l, v1.h ; encoding: [0x81,0xd3,0x0a,0x7e] - -v_not_b16 v5.l, v127.h -// GFX11: v_not_b16_e32 v5.l, v127.h ; encoding: [0xff,0xd3,0x0a,0x7e] - -v_not_b16 v127.l, 0.5 -// GFX11: v_not_b16_e32 v127.l, 0.5 ; encoding: [0xf0,0xd2,0xfe,0x7e] +v_not_b16 v5.l, 0xfe0b +// GFX11: v_not_b16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xd2,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_not_b16 v5.h, src_scc // GFX11: v_not_b16_e32 v5.h, src_scc ; encoding: [0xfd,0xd2,0x0a,0x7f] @@ -2930,8 +3249,14 @@ v_rcp_f16 v5.l, null v_rcp_f16 v5.l, -1 // GFX11: v_rcp_f16_e32 v5.l, -1 ; encoding: [0xc1,0xa8,0x0a,0x7e] 
-v_rcp_f16 v127.l, 0.5 -// GFX11: v_rcp_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xa8,0xfe,0x7e] +v_rcp_f16 v5.l, 0.5 +// GFX11: v_rcp_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xa8,0x0a,0x7e] + +v_rcp_f16 v5.l, src_scc +// GFX11: v_rcp_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7e] + +v_rcp_f16 v5.l, 0xfe0b +// GFX11: v_rcp_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xa8,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rcp_f16 v5.h, src_scc // GFX11: v_rcp_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xa8,0x0a,0x7f] @@ -3089,6 +3414,12 @@ v_rndne_f16 v5.l, v1.l v_rndne_f16 v5.l, v127.l // GFX11: v_rndne_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbd,0x0a,0x7e] +v_rndne_f16 v5.l, v1.h +// GFX11: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e] + +v_rndne_f16 v5.l, v127.h +// GFX11: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e] + v_rndne_f16 v5.l, s1 // GFX11: v_rndne_f16_e32 v5.l, s1 ; encoding: [0x01,0xbc,0x0a,0x7e] @@ -3125,17 +3456,8 @@ v_rndne_f16 v5.l, 0.5 v_rndne_f16 v5.l, src_scc // GFX11: v_rndne_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7e] -v_rndne_f16 v127.l, 0xfe0b -// GFX11: v_rndne_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xbc,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_rndne_f16 v5.l, v1.h -// GFX11: v_rndne_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbd,0x0a,0x7e] - -v_rndne_f16 v5.l, v127.h -// GFX11: v_rndne_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbd,0x0a,0x7e] - -v_rndne_f16 v127.l, 0.5 -// GFX11: v_rndne_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xbc,0xfe,0x7e] +v_rndne_f16 v5.l, 0xfe0b +// GFX11: v_rndne_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xbc,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rndne_f16 v5.h, src_scc // GFX11: v_rndne_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xbc,0x0a,0x7f] @@ -3266,8 +3588,14 @@ v_rsq_f16 v5.l, null v_rsq_f16 v5.l, -1 // GFX11: v_rsq_f16_e32 v5.l, -1 ; encoding: [0xc1,0xac,0x0a,0x7e] -v_rsq_f16 v127.l, 0.5 -// GFX11: v_rsq_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xac,0xfe,0x7e] +v_rsq_f16 v5.l, 0.5 +// GFX11: v_rsq_f16_e32 v5.l, 
0.5 ; encoding: [0xf0,0xac,0x0a,0x7e] + +v_rsq_f16 v5.l, src_scc +// GFX11: v_rsq_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xac,0x0a,0x7e] + +v_rsq_f16 v5.l, 0xfe0b +// GFX11: v_rsq_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xac,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_rsq_f16 v5.h, src_scc // GFX11: v_rsq_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xac,0x0a,0x7f] @@ -3416,6 +3744,12 @@ v_sin_f16 v5.l, v1.l v_sin_f16 v5.l, v127.l // GFX11: v_sin_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xc1,0x0a,0x7e] +v_sin_f16 v5.l, v1.h +// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] + +v_sin_f16 v5.l, v127.h +// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] + v_sin_f16 v5.l, s1 // GFX11: v_sin_f16_e32 v5.l, s1 ; encoding: [0x01,0xc0,0x0a,0x7e] @@ -3452,17 +3786,8 @@ v_sin_f16 v5.l, 0.5 v_sin_f16 v5.l, src_scc // GFX11: v_sin_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7e] -v_sin_f16 v127.l, 0xfe0b -// GFX11: v_sin_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xc0,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_sin_f16 v5.l, v1.h -// GFX11: v_sin_f16_e32 v5.l, v1.h ; encoding: [0x81,0xc1,0x0a,0x7e] - -v_sin_f16 v5.l, v127.h -// GFX11: v_sin_f16_e32 v5.l, v127.h ; encoding: [0xff,0xc1,0x0a,0x7e] - -v_sin_f16 v127.l, 0.5 -// GFX11: v_sin_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xc0,0xfe,0x7e] +v_sin_f16 v5.l, 0xfe0b +// GFX11: v_sin_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xc0,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_sin_f16 v5.h, src_scc // GFX11: v_sin_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xc0,0x0a,0x7f] @@ -3557,8 +3882,14 @@ v_sqrt_f16 v5.l, null v_sqrt_f16 v5.l, -1 // GFX11: v_sqrt_f16_e32 v5.l, -1 ; encoding: [0xc1,0xaa,0x0a,0x7e] -v_sqrt_f16 v127.l, 0.5 -// GFX11: v_sqrt_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xaa,0xfe,0x7e] +v_sqrt_f16 v5.l, 0.5 +// GFX11: v_sqrt_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0xaa,0x0a,0x7e] + +v_sqrt_f16 v5.l, src_scc +// GFX11: v_sqrt_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7e] + +v_sqrt_f16 v5.l, 0xfe0b +// GFX11: 
v_sqrt_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xaa,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_sqrt_f16 v5.h, src_scc // GFX11: v_sqrt_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xaa,0x0a,0x7f] @@ -3674,6 +4005,12 @@ v_trunc_f16 v5.l, v1.l v_trunc_f16 v5.l, v127.l // GFX11: v_trunc_f16_e32 v5.l, v127.l ; encoding: [0x7f,0xbb,0x0a,0x7e] +v_trunc_f16 v5.l, v1.h +// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] + +v_trunc_f16 v5.l, v127.h +// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] + v_trunc_f16 v5.l, s1 // GFX11: v_trunc_f16_e32 v5.l, s1 ; encoding: [0x01,0xba,0x0a,0x7e] @@ -3710,17 +4047,8 @@ v_trunc_f16 v5.l, 0.5 v_trunc_f16 v5.l, src_scc // GFX11: v_trunc_f16_e32 v5.l, src_scc ; encoding: [0xfd,0xba,0x0a,0x7e] -v_trunc_f16 v127.l, 0xfe0b -// GFX11: v_trunc_f16_e32 v127.l, 0xfe0b ; encoding: [0xff,0xba,0xfe,0x7e,0x0b,0xfe,0x00,0x00] - -v_trunc_f16 v5.l, v1.h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] - -v_trunc_f16 v5.l, v127.h -// GFX11: v_trunc_f16_e32 v5.l, v127.h ; encoding: [0xff,0xbb,0x0a,0x7e] - -v_trunc_f16 v127.l, 0.5 -// GFX11: v_trunc_f16_e32 v127.l, 0.5 ; encoding: [0xf0,0xba,0xfe,0x7e] +v_trunc_f16 v5.l, 0xfe0b +// GFX11: v_trunc_f16_e32 v5.l, 0xfe0b ; encoding: [0xff,0xba,0x0a,0x7e,0x0b,0xfe,0x00,0x00] v_trunc_f16 v5.h, src_scc // GFX11: v_trunc_f16_e32 v5.h, src_scc ; encoding: [0xfd,0xba,0x0a,0x7f] @@ -3808,9 +4136,3 @@ v_trunc_f64 v[5:6], src_scc v_trunc_f64 v[254:255], 0xaf123456 // GFX11: v_trunc_f64_e32 v[254:255], 0xaf123456 ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf] - -v_trunc_f16 v[5].l, v[1].h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] - -v_trunc_f16 v[5:5].l, v[1:1].h -// GFX11: v_trunc_f16_e32 v5.l, v1.h ; encoding: [0x81,0xbb,0x0a,0x7e] diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll index 5bff2a2e6b208..246a8c42ba912 100644 --- 
a/llvm/test/Transforms/Attributor/dereferenceable-1.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -555,12 +555,10 @@ cont2: ; *ptr = 4; ; } ; } -; -; FIXME: %ptr should be dereferenceable(4) define dso_local void @rec-branch-1(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; CHECK-LABEL: define {{[^@]+}}@rec-branch-1 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree writeonly captures(none) [[PTR:%.*]]) #[[ATTR3]] { +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE3:%.*]], label [[IF_THEN:%.*]] @@ -630,11 +628,10 @@ if.end8: ; preds = %if.then5, %if.else6 ; rec-branch-2(1, 1, 1, ptr); ; } ; } -; FIXME: %ptr should be dereferenceable(4) define dso_local void @rec-branch-2(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK: Function Attrs: nofree nosync nounwind memory(argmem: write) ; CHECK-LABEL: define {{[^@]+}}@rec-branch-2 -; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree writeonly captures(none) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE3:%.*]], label [[IF_THEN:%.*]] @@ -654,7 +651,7 @@ define dso_local void @rec-branch-2(i32 %a, i32 %b, i32 %c, ptr %ptr) { ; CHECK-NEXT: store i32 3, ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: if.else6: -; CHECK-NEXT: tail call void @rec-branch-2(i32 noundef 1, i32 noundef 1, i32 noundef 1, ptr nofree writeonly captures(none) [[PTR]]) #[[ATTR8:[0-9]+]] +; 
CHECK-NEXT: tail call void @rec-branch-2(i32 noundef 1, i32 noundef 1, i32 noundef 1, ptr nofree nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]]) #[[ATTR8:[0-9]+]] ; CHECK-NEXT: br label [[IF_END8]] ; CHECK: if.end8: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll index 2ff8a3fa3a688..57a6d09af64fa 100644 --- a/llvm/test/Transforms/Attributor/nonnull.ll +++ b/llvm/test/Transforms/Attributor/nonnull.ll @@ -32,16 +32,27 @@ define ptr @test2(ptr nonnull %p) { } define ptr @test2A(i1 %c, ptr %ret) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test2A -; CHECK-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] -; CHECK-NEXT: ret ptr [[RET]] -; CHECK: B: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[RET]]) ] -; CHECK-NEXT: ret ptr [[RET]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test2A +; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { +; TUNIT-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; TUNIT: A: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] +; TUNIT-NEXT: ret ptr [[RET]] +; TUNIT: B: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "nonnull"(ptr [[RET]]) ] +; TUNIT-NEXT: ret ptr [[RET]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: 
define {{[^@]+}}@test2A +; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2:[0-9]+]] { +; CGSCC-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CGSCC: A: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16:[0-9]+]] [ "nonnull"(ptr [[RET]]) ] +; CGSCC-NEXT: ret ptr [[RET]] +; CGSCC: B: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[RET]]) ] +; CGSCC-NEXT: ret ptr [[RET]] ; br i1 %c, label %A, label %B A: @@ -53,16 +64,27 @@ B: } define ptr @test2B(i1 %c, ptr %ret) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test2B -; CHECK-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] -; CHECK-NEXT: ret ptr [[RET]] -; CHECK: B: -; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] -; CHECK-NEXT: ret ptr [[RET]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test2B +; TUNIT-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; TUNIT: A: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; TUNIT-NEXT: ret ptr [[RET]] +; TUNIT: B: +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; TUNIT-NEXT: ret ptr [[RET]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse 
nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: define {{[^@]+}}@test2B +; CGSCC-SAME: (i1 noundef [[C:%.*]], ptr nofree nonnull readnone returned dereferenceable(4) "no-capture-maybe-returned" [[RET:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CGSCC: A: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; CGSCC-NEXT: ret ptr [[RET]] +; CGSCC: B: +; CGSCC-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "dereferenceable"(ptr [[RET]], i32 4) ] +; CGSCC-NEXT: ret ptr [[RET]] ; br i1 %c, label %A, label %B A: @@ -273,13 +295,21 @@ define ptr @test9(ptr %a, i64 %n) { ; ATTRIBUTOR_OPM: define ptr @test10 ; ATTRIBUTOR_NPM: define nonnull ptr @test10 define ptr @test10(ptr %a, i64 %n) { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) -; CHECK-LABEL: define {{[^@]+}}@test10 -; CHECK-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR16]] -; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] -; CHECK-NEXT: ret ptr [[B]] +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; TUNIT-LABEL: define {{[^@]+}}@test10 +; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 +; TUNIT-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR15]] +; TUNIT-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] +; TUNIT-NEXT: ret ptr [[B]] +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) +; CGSCC-LABEL: define {{[^@]+}}@test10 +; CGSCC-SAME: (ptr nofree readnone 
"no-capture-maybe-returned" [[A:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 +; CGSCC-NEXT: call void @llvm.assume(i1 noundef [[CMP]]) #[[ATTR16]] +; CGSCC-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] +; CGSCC-NEXT: ret ptr [[B]] ; %cmp = icmp ne i64 %n, 0 call void @llvm.assume(i1 %cmp) @@ -392,50 +422,22 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { -; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull readonly %arg) -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) -; TUNIT-LABEL: define {{[^@]+}}@f1 -; TUNIT-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR6:[0-9]+]] { -; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null -; TUNIT-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] -; TUNIT: bb1: -; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4 -; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 -; TUNIT-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]] -; TUNIT: bb4: -; TUNIT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1 -; TUNIT-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR17:[0-9]+]] -; TUNIT-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1 -; TUNIT-NEXT: br label [[BB9]] -; TUNIT: bb6: -; TUNIT-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP7]] -; TUNIT: bb9: -; TUNIT-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ] -; TUNIT-NEXT: ret ptr [[TMP10]] -; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) ; CGSCC-LABEL: define {{[^@]+}}@f1 -; CGSCC-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { +; CGSCC-SAME: (ptr nofree nonnull readonly align 4 
captures(none) dereferenceable(4) [[ARG:%.*]]) #[[ATTR5:[0-9]+]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null -; CGSCC-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] +; CGSCC-NEXT: br label [[BB1:%.*]] ; CGSCC: bb1: -; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4 +; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARG]], align 4, !invariant.load [[META0:![0-9]+]] ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]] ; CGSCC: bb4: -; CGSCC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1 -; CGSCC-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR17:[0-9]+]] -; CGSCC-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1 -; CGSCC-NEXT: br label [[BB9]] +; CGSCC-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr undef, i64 -1 +; CGSCC-NEXT: br label [[BB9:%.*]] ; CGSCC: bb6: -; CGSCC-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP7]] +; CGSCC-NEXT: ret ptr undef ; CGSCC: bb9: -; CGSCC-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ] -; CGSCC-NEXT: ret ptr [[TMP10]] +; CGSCC-NEXT: ret ptr undef ; bb: @@ -463,19 +465,11 @@ bb9: ; preds = %bb4, %bb } define internal ptr @f2(ptr %arg) { -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) -; TUNIT-LABEL: define {{[^@]+}}@f2 -; TUNIT-SAME: (ptr nofree nonnull readonly align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR6]] { -; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP]] -; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@f2 -; CGSCC-SAME: (ptr nofree nonnull 
readonly align 4 dereferenceable(4) [[ARG:%.*]]) #[[ATTR5]] { +; CGSCC-SAME: (ptr noalias nofree nonnull readnone align 4 captures(none) dereferenceable(4) [[ARG:%.*]]) #[[ATTR6:[0-9]+]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP]] +; CGSCC-NEXT: ret ptr undef ; bb: %tmp = tail call ptr @f1(ptr %arg) @@ -484,19 +478,17 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f3(ptr nonnull readonly %arg) -; TUNIT: Function Attrs: nofree nosync nounwind memory(argmem: read) +; TUNIT: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@f3 -; TUNIT-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR6]] { +; TUNIT-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: bb: -; TUNIT-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; TUNIT-NEXT: ret ptr [[TMP]] +; TUNIT-NEXT: ret ptr undef ; -; CGSCC: Function Attrs: nofree nosync nounwind memory(argmem: read) +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@f3 -; CGSCC-SAME: (ptr nofree readonly [[ARG:%.*]]) #[[ATTR5]] { +; CGSCC-SAME: (ptr nofree readnone captures(none) [[ARG:%.*]]) #[[ATTR1]] { ; CGSCC-NEXT: bb: -; CGSCC-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR17]] -; CGSCC-NEXT: ret ptr [[TMP]] +; CGSCC-NEXT: ret ptr undef ; bb: ; FIXME: missing nonnull. 
It should be @f1(ptr nonnull readonly %arg) @@ -529,26 +521,26 @@ declare void @fun3(ptr, ptr, ptr) #1 define void @f16(ptr %a, ptr %b, i8 %c) { ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f16 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] { ; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR7:[0-9]+]] +; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR6:[0-9]+]] ; TUNIT-NEXT: ret void ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f16 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] { ; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR6:[0-9]+]] +; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR7:[0-9]+]] ; CGSCC-NEXT: ret void ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp = icmp eq i8 %c, 0 @@ -571,32 +563,32 @@ define void @f17(ptr %a, i8 %c) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f17 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i8 
[[C:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT:%.*]] ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT]] ; TUNIT: cont: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f17 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT:%.*]] ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT]] ; CGSCC: cont: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp = icmp eq i8 %c, 0 @@ -625,50 +617,50 @@ cont: define void @f18(ptr %a, ptr %b, i8 %c) { ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@f18 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0 ; TUNIT-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TUNIT: if.then: -; 
TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT:%.*]] ; TUNIT: if.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT]] ; TUNIT: cont: ; TUNIT-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1 ; TUNIT-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]] ; TUNIT: cont.then: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT2:%.*]] ; TUNIT: cont.else: -; TUNIT-NEXT: tail call void @fun0() #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun0() #[[ATTR6]] ; TUNIT-NEXT: br label [[CONT2]] ; TUNIT: cont2: -; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@f18 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0 ; CGSCC-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CGSCC: if.then: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT:%.*]] ; CGSCC: if.else: -; CGSCC-NEXT: tail call void @fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT]] ; CGSCC: cont: ; CGSCC-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1 ; CGSCC-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]] ; CGSCC: cont.then: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT2:%.*]] ; CGSCC: cont.else: -; CGSCC-NEXT: tail call void 
@fun0() #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun0() #[[ATTR7]] ; CGSCC-NEXT: br label [[CONT2]] ; CGSCC: cont2: -; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret void ; %cmp1 = icmp eq i8 %c, 0 @@ -857,11 +849,17 @@ define i8 @parent6(ptr %a, ptr %b) { ; The nonnull callsite is guaranteed to execute, so the argument must be nonnull throughout the parent. define i8 @parent7(ptr %a) { -; CHECK-LABEL: define {{[^@]+}}@parent7 -; CHECK-SAME: (ptr nonnull [[A:%.*]]) { -; CHECK-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR18:[0-9]+]] -; CHECK-NEXT: call void @use1nonnull(ptr nonnull [[A]]) -; CHECK-NEXT: ret i8 [[RET]] +; TUNIT-LABEL: define {{[^@]+}}@parent7 +; TUNIT-SAME: (ptr nonnull [[A:%.*]]) { +; TUNIT-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR16:[0-9]+]] +; TUNIT-NEXT: call void @use1nonnull(ptr nonnull [[A]]) +; TUNIT-NEXT: ret i8 [[RET]] +; +; CGSCC-LABEL: define {{[^@]+}}@parent7 +; CGSCC-SAME: (ptr nonnull [[A:%.*]]) { +; CGSCC-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull readonly [[A]]) #[[ATTR17:[0-9]+]] +; CGSCC-NEXT: call void @use1nonnull(ptr nonnull [[A]]) +; CGSCC-NEXT: ret i8 [[RET]] ; @@ -931,13 +929,13 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { ; Should't be able to derive nonnull based on gep. 
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@gep1_no_null_opt -; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; TUNIT-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; TUNIT-NEXT: ret ptr [[Q]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@gep1_no_null_opt -; CGSCC-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR9:[0-9]+]] { +; CGSCC-SAME: (ptr nofree readnone "no-capture-maybe-returned" [[P:%.*]]) #[[ATTR10:[0-9]+]] { ; CGSCC-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; CGSCC-NEXT: ret ptr [[Q]] ; @@ -983,8 +981,8 @@ define ptr @g1() { ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@g1 -; CGSCC-SAME: () #[[ATTR10:[0-9]+]] { -; CGSCC-NEXT: [[C:%.*]] = call noundef nonnull align 4 ptr @g2() #[[ATTR19:[0-9]+]] +; CGSCC-SAME: () #[[ATTR6]] { +; CGSCC-NEXT: [[C:%.*]] = call noundef nonnull align 4 ptr @g2() #[[ATTR18:[0-9]+]] ; CGSCC-NEXT: ret ptr [[C]] ; %c = call ptr @g2() @@ -1045,21 +1043,32 @@ define internal void @control(ptr dereferenceable(4) %a) { } ; Avoid nonnull as we do not touch naked functions define internal void @naked(ptr dereferenceable(4) %a) naked { -; CHECK: Function Attrs: naked -; CHECK-LABEL: define {{[^@]+}}@naked -; CHECK-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: naked +; TUNIT-LABEL: define {{[^@]+}}@naked +; TUNIT-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: naked +; CGSCC-LABEL: define 
{{[^@]+}}@naked +; CGSCC-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { +; CGSCC-NEXT: ret void ; ret void } ; Avoid nonnull as we do not touch optnone define internal void @optnone(ptr dereferenceable(4) %a) optnone noinline { ; -; CHECK: Function Attrs: noinline optnone -; CHECK-LABEL: define {{[^@]+}}@optnone -; CHECK-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] { -; CHECK-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: noinline optnone +; TUNIT-LABEL: define {{[^@]+}}@optnone +; TUNIT-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] { +; TUNIT-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: noinline optnone +; CGSCC-LABEL: define {{[^@]+}}@optnone +; CGSCC-SAME: (ptr noundef nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] { +; CGSCC-NEXT: call void @use_i32_ptr(ptr nofree noundef nonnull captures(none) [[A]]) +; CGSCC-NEXT: ret void ; call void @use_i32_ptr(ptr %a) ret void @@ -1098,32 +1107,32 @@ define i32 @nonnull_exec_ctx_1(ptr %a, i32 %b) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1 -; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; TUNIT: ex: -; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret i32 [[TMP5]] ; TUNIT: hd: ; TUNIT-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ] -; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] ; 
TUNIT-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 ; TUNIT-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]] ; TUNIT-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]] ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1 -; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; CGSCC: ex: -; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret i32 [[TMP5]] ; CGSCC: hd: ; CGSCC-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ] -; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] ; CGSCC-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 ; CGSCC-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]] ; CGSCC-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]] @@ -1148,16 +1157,16 @@ define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b -; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; TUNIT: ex: -; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] +; TUNIT-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] ; TUNIT-NEXT: ret i32 [[TMP5]] ; TUNIT: hd: ; TUNIT-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ] -; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] +; TUNIT-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] ; TUNIT-NEXT: br label [[HD2]] ; TUNIT: hd2: ; TUNIT-NEXT: [[TMP8]] 
= add nuw i32 [[TMP7]], 1 @@ -1166,16 +1175,16 @@ define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_1b -; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] ; CGSCC: ex: -; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR6]] +; CGSCC-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]]) #[[ATTR7]] ; CGSCC-NEXT: ret i32 [[TMP5]] ; CGSCC: hd: ; CGSCC-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ] -; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR6]] +; CGSCC-NEXT: tail call void @h(ptr [[A]]) #[[ATTR7]] ; CGSCC-NEXT: br label [[HD2]] ; CGSCC: hd2: ; CGSCC-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1 @@ -1205,7 +1214,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind { ; ; TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2 -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1221,7 +1230,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2 -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1255,7 +1264,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind { ; ; 
TUNIT: Function Attrs: mustprogress nounwind willreturn ; TUNIT-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b -; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { +; TUNIT-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: en: ; TUNIT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; TUNIT-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1273,7 +1282,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind { ; ; CGSCC: Function Attrs: mustprogress nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@nonnull_exec_ctx_2b -; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] { +; CGSCC-SAME: (ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] { ; CGSCC-NEXT: en: ; CGSCC-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0 ; CGSCC-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]] @@ -1392,8 +1401,8 @@ declare ptr @strrchr(ptr %0, i32 %1) nofree nounwind readonly willreturn define ptr @mybasename(ptr nofree readonly %str) { ; TUNIT: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read) ; TUNIT-LABEL: define {{[^@]+}}@mybasename -; TUNIT-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR14:[0-9]+]] { -; TUNIT-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR19:[0-9]+]] +; TUNIT-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR13:[0-9]+]] { +; TUNIT-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR17:[0-9]+]] ; TUNIT-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[CALL]], null ; TUNIT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 1 ; TUNIT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], ptr [[ADD_PTR]], ptr [[STR]] @@ -1402,7 +1411,7 @@ define ptr @mybasename(ptr nofree readonly %str) { ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read) ; CGSCC-LABEL: define {{[^@]+}}@mybasename ; CGSCC-SAME: (ptr nofree readonly [[STR:%.*]]) #[[ATTR14:[0-9]+]] { -; CGSCC-NEXT: [[CALL:%.*]] 
= call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR20:[0-9]+]] +; CGSCC-NEXT: [[CALL:%.*]] = call ptr @strrchr(ptr nofree readonly [[STR]], i32 noundef 47) #[[ATTR19:[0-9]+]] ; CGSCC-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[CALL]], null ; CGSCC-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 1 ; CGSCC-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], ptr [[ADD_PTR]], ptr [[STR]] @@ -1425,7 +1434,7 @@ define void @nonnull_assume_pos(ptr %arg) { ; ; TUNIT-LABEL: define {{[^@]+}}@nonnull_assume_pos ; TUNIT-SAME: (ptr nofree nonnull readnone captures(none) [[ARG:%.*]]) { -; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR16]] [ "nonnull"(ptr [[ARG]]) ] +; TUNIT-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR15]] [ "nonnull"(ptr [[ARG]]) ] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[ARG]]) #[[ATTR5]] ; TUNIT-NEXT: [[TMP1:%.*]] = call ptr @unknown() ; TUNIT-NEXT: ret void @@ -1554,14 +1563,14 @@ define void @phi_caller(ptr %p) { ; TUNIT: Function Attrs: nounwind ; TUNIT-LABEL: define {{[^@]+}}@phi_caller ; TUNIT-SAME: (ptr nofree [[P:%.*]]) #[[ATTR5]] { -; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR20:[0-9]+]] +; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR18:[0-9]+]] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR5]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nounwind ; CGSCC-LABEL: define {{[^@]+}}@phi_caller ; CGSCC-SAME: (ptr nofree [[P:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR21:[0-9]+]] +; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @phi(ptr noalias nofree readnone [[P]]) #[[ATTR20:[0-9]+]] ; CGSCC-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR4]] ; CGSCC-NEXT: ret void ; @@ -1594,14 +1603,14 @@ define void 
@multi_ret_caller(ptr %p) { ; TUNIT: Function Attrs: nounwind ; TUNIT-LABEL: define {{[^@]+}}@multi_ret_caller ; TUNIT-SAME: (ptr nofree [[P:%.*]]) #[[ATTR5]] { -; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR20]] +; TUNIT-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR18]] ; TUNIT-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR5]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nounwind ; CGSCC-LABEL: define {{[^@]+}}@multi_ret_caller ; CGSCC-SAME: (ptr nofree [[P:%.*]]) #[[ATTR4]] { -; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR21]] +; CGSCC-NEXT: [[C:%.*]] = call nonnull ptr @multi_ret(ptr noalias nofree readnone [[P]]) #[[ATTR20]] ; CGSCC-NEXT: call void @use_i8_ptr(ptr noalias nofree nonnull readnone captures(none) [[C]]) #[[ATTR4]] ; CGSCC-NEXT: ret void ; @@ -1613,18 +1622,31 @@ define void @multi_ret_caller(ptr %p) { ; From https://github.com/llvm/llvm-project/pull/85810 @G = internal global i64 1, align 8 define dso_local ptr @update_global_in_alive_bb() { -; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn -; CHECK-LABEL: define {{[^@]+}}@update_global_in_alive_bb -; CHECK-SAME: () #[[ATTR15:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i64 0, ptr @G, align 8 -; CHECK-NEXT: ret ptr inttoptr (i64 5 to ptr) -; CHECK: if.else: -; CHECK-NEXT: ret ptr null +; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; TUNIT-LABEL: define {{[^@]+}}@update_global_in_alive_bb +; TUNIT-SAME: () #[[ATTR14:[0-9]+]] { +; TUNIT-NEXT: entry: +; TUNIT-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 +; TUNIT-NEXT: [[CMP:%.*]] 
= icmp ne i64 [[TMP0]], 0 +; TUNIT-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; TUNIT: if.then: +; TUNIT-NEXT: store i64 0, ptr @G, align 8 +; TUNIT-NEXT: ret ptr inttoptr (i64 5 to ptr) +; TUNIT: if.else: +; TUNIT-NEXT: ret ptr null +; +; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn +; CGSCC-LABEL: define {{[^@]+}}@update_global_in_alive_bb +; CGSCC-SAME: () #[[ATTR15:[0-9]+]] { +; CGSCC-NEXT: entry: +; CGSCC-NEXT: [[TMP0:%.*]] = load i64, ptr @G, align 8 +; CGSCC-NEXT: [[CMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CGSCC-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CGSCC: if.then: +; CGSCC-NEXT: store i64 0, ptr @G, align 8 +; CGSCC-NEXT: ret ptr inttoptr (i64 5 to ptr) +; CGSCC: if.else: +; CGSCC-NEXT: ret ptr null ; entry: %0 = load i64, ptr @G, align 8 @@ -1640,48 +1662,47 @@ if.else: attributes #0 = { null_pointer_is_valid } attributes #1 = { nounwind willreturn} ;. -; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } -; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR4]] = { noreturn } -; TUNIT: attributes #[[ATTR5]] = { nounwind } -; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind memory(argmem: read) } -; TUNIT: attributes #[[ATTR7]] = { nounwind willreturn } -; TUNIT: attributes #[[ATTR8]] = { mustprogress nounwind willreturn } -; TUNIT: attributes #[[ATTR9:[0-9]+]] = { nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } -; TUNIT: attributes #[[ATTR11]] = { naked } -; TUNIT: attributes 
#[[ATTR12]] = { noinline optnone } -; TUNIT: attributes #[[ATTR13:[0-9]+]] = { nofree nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR14]] = { mustprogress nofree nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR15]] = { mustprogress nofree norecurse nosync nounwind willreturn } -; TUNIT: attributes #[[ATTR16]] = { nofree willreturn memory(write) } -; TUNIT: attributes #[[ATTR17]] = { nofree nosync nounwind memory(read) } -; TUNIT: attributes #[[ATTR18]] = { nosync willreturn memory(read) } -; TUNIT: attributes #[[ATTR19]] = { nofree nosync willreturn memory(read) } -; TUNIT: attributes #[[ATTR20]] = { nofree nosync nounwind willreturn memory(none) } -;. ; CGSCC: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } ; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } ; CGSCC: attributes #[[ATTR3]] = { noreturn } ; CGSCC: attributes #[[ATTR4]] = { nounwind } -; CGSCC: attributes #[[ATTR5]] = { nofree nosync nounwind memory(argmem: read) } -; CGSCC: attributes #[[ATTR6]] = { nounwind willreturn } -; CGSCC: attributes #[[ATTR7]] = { mustprogress nounwind willreturn } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nounwind willreturn memory(read) } -; CGSCC: attributes #[[ATTR9]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } -; CGSCC: attributes #[[ATTR10]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: read) } +; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; CGSCC: attributes #[[ATTR7]] = { nounwind willreturn } +; CGSCC: attributes #[[ATTR8]] = { mustprogress nounwind willreturn } +; 
CGSCC: attributes #[[ATTR9:[0-9]+]] = { nounwind willreturn memory(read) } +; CGSCC: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } ; CGSCC: attributes #[[ATTR11]] = { naked } ; CGSCC: attributes #[[ATTR12]] = { noinline optnone } ; CGSCC: attributes #[[ATTR13:[0-9]+]] = { nofree nounwind willreturn memory(read) } ; CGSCC: attributes #[[ATTR14]] = { mustprogress nofree nosync nounwind willreturn memory(read) } ; CGSCC: attributes #[[ATTR15]] = { mustprogress nofree norecurse nosync nounwind willreturn } ; CGSCC: attributes #[[ATTR16]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR17]] = { nofree nosync nounwind memory(read) } -; CGSCC: attributes #[[ATTR18]] = { nosync willreturn memory(read) } -; CGSCC: attributes #[[ATTR19]] = { nofree nosync willreturn } -; CGSCC: attributes #[[ATTR20]] = { nofree nosync willreturn memory(read) } -; CGSCC: attributes #[[ATTR21]] = { nofree willreturn } +; CGSCC: attributes #[[ATTR17]] = { nosync willreturn memory(read) } +; CGSCC: attributes #[[ATTR18]] = { nofree nosync willreturn } +; CGSCC: attributes #[[ATTR19]] = { nofree nosync willreturn memory(read) } +; CGSCC: attributes #[[ATTR20]] = { nofree willreturn } +;. 
+; TUNIT: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +; TUNIT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: write) } +; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR4]] = { noreturn } +; TUNIT: attributes #[[ATTR5]] = { nounwind } +; TUNIT: attributes #[[ATTR6]] = { nounwind willreturn } +; TUNIT: attributes #[[ATTR7]] = { mustprogress nounwind willreturn } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR9]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) } +; TUNIT: attributes #[[ATTR10]] = { naked } +; TUNIT: attributes #[[ATTR11]] = { noinline optnone } +; TUNIT: attributes #[[ATTR12:[0-9]+]] = { nofree nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR13]] = { mustprogress nofree nosync nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR14]] = { mustprogress nofree norecurse nosync nounwind willreturn } +; TUNIT: attributes #[[ATTR15]] = { nofree willreturn memory(write) } +; TUNIT: attributes #[[ATTR16]] = { nosync willreturn memory(read) } +; TUNIT: attributes #[[ATTR17]] = { nofree nosync willreturn memory(read) } +; TUNIT: attributes #[[ATTR18]] = { nofree nosync nounwind willreturn memory(none) } +;. +; CGSCC: [[META0]] = !{} ;. 
diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index 3e07fe42261e9..2235f194af8ea 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -1267,7 +1267,7 @@ entry: define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn ; TUNIT-LABEL: define void @noalias_arg_simplifiable_2( -; TUNIT-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; TUNIT-SAME: ptr nofree nonnull captures(none) dereferenceable(24) [[BYTES:%.*]]) #[[ATTR3]] { ; TUNIT-NEXT: [[ENTRY:.*]]: ; TUNIT-NEXT: br label %[[FOR_COND:.*]] ; TUNIT: [[FOR_COND]]: @@ -1344,7 +1344,7 @@ define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn ; CGSCC-LABEL: define void @noalias_arg_simplifiable_2( -; CGSCC-SAME: ptr nofree captures(none) [[BYTES:%.*]]) #[[ATTR3]] { +; CGSCC-SAME: ptr nofree nonnull align 4 captures(none) dereferenceable(1024) [[BYTES:%.*]]) #[[ATTR3]] { ; CGSCC-NEXT: [[ENTRY:.*]]: ; CGSCC-NEXT: br label %[[FOR_COND:.*]] ; CGSCC: [[FOR_COND]]: @@ -1399,7 +1399,7 @@ define void @noalias_arg_simplifiable_2(ptr %Bytes) { ; CGSCC-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 1023 ; CGSCC-NEXT: store i8 0, ptr [[ARRAYIDX24]], align 1, !tbaa [[CHAR_TBAA15]] ; CGSCC-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, ptr [[BYTES]], i64 500 -; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR21]] +; CGSCC-NEXT: call void @write_arg(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(524) [[ARRAYIDX25]], i32 noundef 0) #[[ATTR21]] ; CGSCC-NEXT: br label %[[FOR_COND27:.*]] ; CGSCC: [[FOR_COND27]]: ; CGSCC-NEXT: 
[[INDVARS_IV12:%.*]] = phi i64 [ [[INDVARS_IV_NEXT13:%.*]], %[[FOR_INC35:.*]] ], [ 0, %[[FOR_END23]] ] diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index d65480b05759a..543f33ee0621b 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -238,7 +238,7 @@ define void @only_exit() local_unnamed_addr #0 { define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_addr #0 { ; CHECK: Function Attrs: noinline nounwind uwtable ; CHECK-LABEL: define {{[^@]+}}@conditional_exit -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr nofree readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK-SAME: (i32 [[TMP0:%.*]], ptr nofree nonnull readonly align 4 captures(none) dereferenceable(4) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] ; CHECK: 4: diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 9d5ae1606f2e3..e06fb1cfd9656 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -360,7 +360,6 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { -; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( ; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: @@ -383,7 +382,7 @@ define internal ptr @f1(ptr %arg) { ; FNATTRS-NEXT: ret ptr [[TMP10]] ; ; ATTRIBUTOR-LABEL: define internal ptr @f1( -; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; ATTRIBUTOR-SAME: ptr nofree nonnull readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; ATTRIBUTOR-NEXT: br 
i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 3b016f8d0a9ff..63348ccf94f78 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -44,7 +44,7 @@ define i64 @same_exit_block_pre_inc_use1() #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP16]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP16]], i1 false) ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -125,7 +125,7 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -187,7 +187,7 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 
[[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -256,7 +256,7 @@ define i64 @loop_contains_safe_div() #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[INDEX1]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -336,7 +336,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -483,12 +483,12 @@ exit: define i64 @same_exit_block_requires_interleaving() { ; CHECK-LABEL: define i64 @same_exit_block_requires_interleaving() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [128 x %my.struct], align 8 +; CHECK-NEXT: [[P1:%.*]] = alloca [128 x [[MY_STRUCT:%.*]]], align 8 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 256) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x %my.struct], ptr [[P1]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [128 x [[MY_STRUCT]]], ptr [[P1]], i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3 ; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[LOOP_END:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll index b40a184a3e425..c56f8327a48b3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll @@ -79,20 +79,20 @@ define i64 @same_exit_block_pre_inc_use1() #0 { ; CHECK: vector.early.exit: ; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 16 -; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP59]], i1 true) +; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP59]], i1 false) ; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3 ; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]] -; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP31]], i1 true) +; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP31]], i1 false) ; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2 ; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]] ; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]] ; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP30]], i1 true) +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP30]], i1 false) ; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1 ; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]] ; 
CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]] -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP32]], i1 false) ; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0 ; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]] ; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll index 794e274a2628c..f11f35319b8fc 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll @@ -31,9 +31,9 @@ define noundef i32 @f(i32 noundef %g) { ; VF4IC2: [[MIDDLE_BLOCK]]: ; VF4IC2-NEXT: br label %[[RETURN:.*]] ; VF4IC2: [[VECTOR_EARLY_EXIT]]: -; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; VF4IC2-NEXT: [[TMP10:%.*]] = add i64 4, [[TMP9]] -; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; VF4IC2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] ; VF4IC2-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP11]], 4 ; VF4IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 [[TMP10]] @@ -64,7 +64,7 @@ define noundef i32 @f(i32 noundef %g) { ; VF8IC1: [[MIDDLE_BLOCK]]: ; VF8IC1-NEXT: br label %[[RETURN:.*]] ; VF8IC1: [[VECTOR_EARLY_EXIT]]: -; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true) +; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 false) ; 
VF8IC1-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; VF8IC1-NEXT: [[TMP7:%.*]] = add i32 0, [[TMP6]] ; VF8IC1-NEXT: br label %[[RETURN]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index 03b7ed7fe2135..0bc2748b6252d 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -28,7 +28,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[LOOP_END:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: br label %[[LOOP_END]] ; CHECK: [[LOOP_END]]: @@ -140,7 +140,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero(ptr n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[LOOP_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] ; CHECK-NEXT: br label %[[LOOP_END_LOOPEXIT]] ; CHECK: [[SCALAR_PH]]: @@ -336,7 +336,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP10:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] @@ -431,7 +431,7 @@ define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %fi ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] @@ -525,7 +525,7 @@ define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] @@ -602,7 +602,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[LOOP_END:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> 
[[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]] ; CHECK-NEXT: br label %[[LOOP_END]] ; CHECK: [[LOOP_END]]: @@ -740,7 +740,7 @@ define i64 @find_if_pointer_distance_deref_via_assumption(ptr %vec) nofree nosyn ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[BEGIN]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index ed5dcc78eeb78..053863117bdc8 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -124,17 +124,17 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 
[[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -211,17 +211,17 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true) +; VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 false) ; VF4IC4-NEXT: [[TMP16:%.*]] = add i64 12, [[TMP15]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP28]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP28]], i1 false) ; VF4IC4-NEXT: [[TMP18:%.*]] = add i64 8, [[TMP30]] ; VF4IC4-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP18]], i64 [[TMP16]] -; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP14]], i1 true) +; VF4IC4-NEXT: [[TMP21:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP14]], i1 false) ; VF4IC4-NEXT: [[TMP22:%.*]] = add 
i64 4, [[TMP21]] ; VF4IC4-NEXT: [[TMP23:%.*]] = icmp ne i64 [[TMP21]], 4 ; VF4IC4-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i64 [[TMP22]], i64 [[TMP20]] -; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; VF4IC4-NEXT: [[TMP25:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; VF4IC4-NEXT: [[TMP26:%.*]] = add i64 0, [[TMP25]] ; VF4IC4-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP25]], 4 ; VF4IC4-NEXT: [[TMP6:%.*]] = select i1 [[TMP27]], i64 [[TMP26]], i64 [[TMP24]] @@ -304,17 +304,17 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> 
[[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -401,17 +401,17 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -503,17 +503,17 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: 
vector.early.exit: -; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 true) +; VF4IC4-NEXT: [[TMP20:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP35]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 12, [[TMP20]] -; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[TMP22:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP23:%.*]] = add i64 8, [[TMP22]] ; VF4IC4-NEXT: [[TMP24:%.*]] = icmp ne i64 [[TMP22]], 4 ; VF4IC4-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[TMP23]], i64 [[TMP21]] -; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[TMP26:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 4, [[TMP26]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[TMP26]], 4 ; VF4IC4-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP25]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 0, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] @@ -623,17 +623,17 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP43]], i1 true) +; VF4IC4-NEXT: [[TMP28:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP43]], i1 false) ; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 12, [[TMP28]] -; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 
@llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) +; VF4IC4-NEXT: [[TMP30:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 false) ; VF4IC4-NEXT: [[TMP31:%.*]] = add i64 8, [[TMP30]] ; VF4IC4-NEXT: [[TMP32:%.*]] = icmp ne i64 [[TMP30]], 4 ; VF4IC4-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] -; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; VF4IC4-NEXT: [[TMP34:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 4, [[TMP34]] ; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[TMP34]], 4 ; VF4IC4-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i64 [[TMP35]], i64 [[TMP33]] -; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) +; VF4IC4-NEXT: [[TMP38:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 false) ; VF4IC4-NEXT: [[TMP39:%.*]] = add i64 0, [[TMP38]] ; VF4IC4-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP38]], 4 ; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP40]], i64 [[TMP39]], i64 [[TMP37]] @@ -734,17 +734,17 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[LOOP_END:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; VF4IC4-NEXT: [[TMP20:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; VF4IC4-NEXT: [[TMP21:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE8]] ; VF4IC4-NEXT: [[TMP22:%.*]] = icmp ne i64 
[[FIRST_ACTIVE_LANE8]], 4 ; VF4IC4-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 [[TMP20]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP29]], i1 false) ; VF4IC4-NEXT: [[TMP24:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE9]] ; VF4IC4-NEXT: [[TMP25:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE9]], 4 ; VF4IC4-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i64 [[TMP24]], i64 [[TMP23]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP12]], i1 false) ; VF4IC4-NEXT: [[TMP27:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: [[TMP28:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4 ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP28]], i64 [[TMP27]], i64 [[TMP26]] @@ -861,17 +861,17 @@ define i8 @same_exit_block_reverse_use_loaded_value() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP37]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP37]], i1 false) ; VF4IC4-NEXT: [[TMP28:%.*]] = add i64 12, [[FIRST_ACTIVE_LANE]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP20]], i1 false) ; VF4IC4-NEXT: [[TMP29:%.*]] = add i64 8, [[FIRST_ACTIVE_LANE15]] ; VF4IC4-NEXT: [[TMP30:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE15]], 4 ; VF4IC4-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i64 [[TMP29]], i64 [[TMP28]] -; VF4IC4-NEXT: 
[[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; VF4IC4-NEXT: [[TMP32:%.*]] = add i64 4, [[FIRST_ACTIVE_LANE16]] ; VF4IC4-NEXT: [[TMP33:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE16]], 4 ; VF4IC4-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i64 [[TMP32]], i64 [[TMP31]] -; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 true) +; VF4IC4-NEXT: [[FIRST_ACTIVE_LANE1:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP21]], i1 false) ; VF4IC4-NEXT: [[TMP35:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE1]] ; VF4IC4-NEXT: [[TMP36:%.*]] = icmp ne i64 [[FIRST_ACTIVE_LANE1]], 4 ; VF4IC4-NEXT: [[TMP10:%.*]] = select i1 [[TMP36]], i64 [[TMP35]], i64 [[TMP34]] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 4fd8d17073de4..ae03f2426a800 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -424,7 +424,7 @@ define i64 @loop_guard_needed_to_prove_dereferenceable(i32 %x, i1 %cmp2) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: scalar.ph: @@ -572,7 +572,7 @@ define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr derefe ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[IV_NEXT]] ; CHECK-NEXT: br i1 [[CMP_N]], label 
[[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 79821b8be1734..55682bc410527 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -32,7 +32,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -96,7 +96,7 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[DOTCAST]], 2 @@ -160,7 +160,7 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() { ; CHECK: middle.block: ; 
CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = zext i64 [[FIRST_ACTIVE_LANE]] to i128 ; CHECK-NEXT: [[TMP9:%.*]] = add i128 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i128 [[TMP9]] to i32 @@ -226,7 +226,7 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[TMP10]] to float ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float 1.000000e+00, [[DOTCAST]] @@ -294,7 +294,7 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 5 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP20]] @@ -357,7 +357,7 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 true) 
+; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP11]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -420,7 +420,7 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP7]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -485,7 +485,7 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -549,7 +549,7 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 
3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -674,7 +674,7 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -739,7 +739,7 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -801,7 +801,7 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -861,7 +861,7 @@ define ptr @same_exit_block_post_inc_use1_ivptr() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> 
[[TMP15]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP15]], i1 false) ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP9]] @@ -922,7 +922,7 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] @@ -987,7 +987,7 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1122,7 +1122,7 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add 
i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1189,7 +1189,7 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -1258,7 +1258,7 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP17]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1 ; CHECK-NEXT: [[TMP21:%.*]] = add i64 3, [[TMP11]] @@ -1330,7 +1330,7 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) { ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[TMP0]], 1 ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP19]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 [[START]], [[TMP12]] @@ -1401,7 +1401,7 @@ define i64 @loop_contains_safe_call() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: 
[[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1463,7 +1463,7 @@ define i64 @loop_contains_safe_div() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1526,7 +1526,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] @@ -1594,7 +1594,7 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP8]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 
[[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = sub i64 1023, [[TMP12]] ; CHECK-NEXT: br label [[LOOP_END:%.*]] @@ -1719,7 +1719,7 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index 8da1dca52e87b..ef4d5c6d66700 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -127,7 +127,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br label %[[EXIT:.*]] ; VF8UF1: [[VECTOR_EARLY_EXIT]]: -; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true) +; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 false) ; VF8UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; VF8UF1-NEXT: br label %[[EXIT]] ; VF8UF1: [[EXIT]]: @@ -156,9 +156,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br label %[[EXIT:.*]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: -; VF8UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 
true) +; VF8UF2-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 false) ; VF8UF2-NEXT: [[TMP7:%.*]] = add i64 8, [[TMP5]] -; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP1]], i1 true) +; VF8UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP1]], i1 false) ; VF8UF2-NEXT: [[TMP9:%.*]] = add i64 0, [[TMP8]] ; VF8UF2-NEXT: [[TMP10:%.*]] = icmp ne i64 [[TMP8]], 8 ; VF8UF2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[TMP7]] @@ -185,7 +185,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br label %[[EXIT:.*]] ; VF16UF1: [[VECTOR_EARLY_EXIT]]: -; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true) +; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 false) ; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF16UF1-NEXT: br label %[[EXIT]] ; VF16UF1: [[EXIT]]: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll index aea9a80ba6dd0..a727973b43511 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll @@ -28,7 +28,7 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16 ; CHECK: [[MIDDLE_SPLIT]]: ; CHECK-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]] ; CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 false) ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 ; CHECK-NEXT: 
[[TMP8:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[TMP7]] @@ -149,13 +149,14 @@ define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[LOOP_HEADER_I_PREHEADER2:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], -8 -; CHECK: [[TMP9:%.*]] = getelementptr -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[XTRAITER]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[PROL_ITER_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX1]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP1]], align 2 ; CHECK-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze <8 x i16> [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD_FR]], splat (i16 1) ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -170,10 +171,10 @@ define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[XTRAITER]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I_PREHEADER2]] ; CHECK: [[LOOP_HEADER_I_PREHEADER2]]: -; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PTR_IV_I_PH:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[NEXT_GEP]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER_I:.*]] ; 
CHECK: [[VECTOR_EARLY_EXIT]]: -; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP4]], i1 false) ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s new file mode 100644 index 0000000000000..d7afebe6e5e55 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s @@ -0,0 +1,17 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +// INSTS= +// v_ceil_f32 OPS32 +// v_cos_f32 OPS32 +// +// SRC32= +// v1 # A comment. +// 0.5 +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 + +v_bfrev_b32 v5, v1 +// GFX11: v_bfrev_b32_e32 v5, v1 ; encoding: [0x01,0x71,0x0a,0x7e] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected new file mode 100644 index 0000000000000..21ee43a8a06a3 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu-templates.s.expected @@ -0,0 +1,32 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,+wavefrontsize32 -show-encoding %s | FileCheck --check-prefix=GFX11 %s + +// INSTS= +// v_ceil_f32 OPS32 +// v_cos_f32 OPS32 +// +// SRC32= +// v1 # A comment. 
+// 0.5 +// +// OPS32= +// v5, SRC32 +// v255, 0xaf123456 + +v_ceil_f32 v5, v1 +// GFX11: v_ceil_f32_e32 v5, v1 ; encoding: [0x01,0x45,0x0a,0x7e] + +v_ceil_f32 v5, 0.5 +// GFX11: v_ceil_f32_e32 v5, 0.5 ; encoding: [0xf0,0x44,0x0a,0x7e] + +v_ceil_f32 v255, 0xaf123456 +// GFX11: v_ceil_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x44,0xfe,0x7f,0x56,0x34,0x12,0xaf] + +v_cos_f32 v5, v1 +// GFX11: v_cos_f32_e32 v5, v1 ; encoding: [0x01,0x6d,0x0a,0x7e] + +v_cos_f32 v5, 0.5 +// GFX11: v_cos_f32_e32 v5, 0.5 ; encoding: [0xf0,0x6c,0x0a,0x7e] + +v_cos_f32 v255, 0xaf123456 +// GFX11: v_cos_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x6c,0xfe,0x7f,0x56,0x34,0x12,0xaf] diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test new file mode 100644 index 0000000000000..6dfdb985d8cdb --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/amdgpu-templates.test @@ -0,0 +1,5 @@ +# REQUIRES: amdgpu-registered-target +## Test expanding instruction templates. 
+ +# RUN: cp -f %S/Inputs/amdgpu-templates.s %t.s && %update_mc_test_checks %t.s +# RUN: diff -u %S/Inputs//amdgpu-templates.s.expected %t.s diff --git a/llvm/utils/lit/examples/many-tests/ManyTests.py b/llvm/utils/lit/examples/many-tests/ManyTests.py index 89e818a037c39..ffdbbad5a77b1 100644 --- a/llvm/utils/lit/examples/many-tests/ManyTests.py +++ b/llvm/utils/lit/examples/many-tests/ManyTests.py @@ -1,4 +1,5 @@ -from lit import Test, TestFormat +from lit import Test +from lit.formats import TestFormat class ManyTests(TestFormat): diff --git a/llvm/utils/update_mc_test_checks.py b/llvm/utils/update_mc_test_checks.py index 363278d1b1f97..9b80267e8ad8c 100755 --- a/llvm/utils/update_mc_test_checks.py +++ b/llvm/utils/update_mc_test_checks.py @@ -29,6 +29,11 @@ ] +class Error(Exception): + def __init__(self, test_info, line_no, msg): + super().__init__(f"{test_info.path}:{line_no}: {msg}") + + def invoke_tool(exe, check_rc, cmd_args, testline, verbose=False): substs = SUBSTITUTIONS + [(t, exe) for t in mc_LIKE_TOOLS] args = [common.applySubstitutions(cmd, substs) for cmd in cmd_args.split("|")] @@ -125,6 +130,62 @@ def getErrCheckLine(prefix, output, mc_mode, line_offset=1): ) +def parse_token_defs(test_info): + tokens = {} + current_token = None + for line_no, line in enumerate(test_info.input_lines, start=1): + # Remove comments. + line = line.split("#")[0].rstrip() + + # Skip everything up to the instructions definition. + if not tokens and not current_token and line != "// INSTS=": + continue + + if not line.startswith("//"): + break + + original_len = len(line) + line = line[2:].lstrip(" ") + indent = original_len - len(line) + + if not line: + current_token = None + continue + + # Define a new token. 
+ if not current_token: + if indent != 4 or not line.endswith("="): + raise Error(test_info, line_no, "token definition expected") + + current_token = line[:-1].strip() + if current_token in tokens: + raise Error(test_info, line_no, f"'{current_token}' redefined") + + tokens[current_token] = [] + continue + + # Add token value. + if indent != 8: + raise Error(test_info, line_no, "wrong indentation for token value") + + tokens[current_token].append(line) + + return tokens + + +def expand_insts(tokens): + def subst(s): + for token, values in tokens.items(): + if token in s: + for value in values: + yield from subst(s.replace(token, value, 1)) + return + + yield s + + yield from subst("INSTS") + + def update_test(ti: common.TestInfo): if ti.path.endswith(".s"): mc_mode = "asm" @@ -209,6 +270,14 @@ def update_test(ti: common.TestInfo): testlines = list(dict.fromkeys(testlines)) common.debug("Valid test line found: ", len(testlines)) + # Where instruction templates are specified, use them instead. + use_asm_templates = False + if mc_mode == "asm": + tokens = parse_token_defs(ti) + if "INSTS" in tokens: + testlines = list(expand_insts(tokens)) + use_asm_templates = True + raw_output = [] raw_prefixes = [] for ( @@ -244,7 +313,6 @@ def update_test(ti: common.TestInfo): raw_prefixes.append(prefixes) - output_lines = [] generated_prefixes = {} sort_keys = {} used_prefixes = set() @@ -321,14 +389,32 @@ def update_test(ti: common.TestInfo): generated_prefixes[input_line] = "\n".join(check_lines) # write output - for input_info in ti.iterlines(output_lines): - input_line = input_info.line - if input_line in testlines: - output_lines.append(input_line) - output_lines.append(generated_prefixes[input_line]) - - elif should_add_line_to_output(input_line, prefix_set, mc_mode): - output_lines.append(input_line) + output_lines = [] + if use_asm_templates: + # Keep all leading comments and empty lines. 
+ for input_info in ti.iterlines(output_lines): + input_line = input_info.line + if not input_line or input_line.startswith(COMMENT[mc_mode]): + output_lines.append(input_line) + continue + break + + # Remove tail empty lines. + while not output_lines[-1]: + del output_lines[-1] + + # Emit test and check lines. + for input_line in testlines: + output_lines.extend(["", input_line, generated_prefixes[input_line]]) + else: + for input_info in ti.iterlines(output_lines): + input_line = input_info.line + if input_line in testlines: + output_lines.append(input_line) + output_lines.append(generated_prefixes[input_line]) + + elif should_add_line_to_output(input_line, prefix_set, mc_mode): + output_lines.append(input_line) if ti.args.unique or ti.args.sort: # split with double newlines diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp index 214410f78e51c..3667fdb2bb728 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp @@ -347,28 +347,55 @@ LogicalResult LoadOpOfExpandShapeOpFolder::matchAndRewrite( loadOp.getLoc(), rewriter, expandShapeOp, indices, sourceIndices, isa(loadOp.getOperation())))) return failure(); - llvm::TypeSwitch(loadOp) + + return llvm::TypeSwitch(loadOp) .Case([&](affine::AffineLoadOp op) { rewriter.replaceOpWithNewOp( loadOp, expandShapeOp.getViewSource(), sourceIndices); + return success(); }) .Case([&](memref::LoadOp op) { rewriter.replaceOpWithNewOp( loadOp, expandShapeOp.getViewSource(), sourceIndices, op.getNontemporal()); + return success(); }) .Case([&](vector::LoadOp op) { rewriter.replaceOpWithNewOp( op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, op.getNontemporal()); + return success(); }) .Case([&](vector::MaskedLoadOp op) { rewriter.replaceOpWithNewOp( op, op.getType(), expandShapeOp.getViewSource(), sourceIndices, op.getMask(), op.getPassThru()); + return 
success(); + }) + .Case([&](vector::TransferReadOp op) { + // We only support minor identity maps in the permutation attribute. + if (!op.getPermutationMap().isMinorIdentity()) + return failure(); + + // We only support the case where the source of the expand shape has + // rank greater than or equal to the vector rank. + const int64_t sourceRank = sourceIndices.size(); + const int64_t vectorRank = op.getVectorType().getRank(); + if (sourceRank < vectorRank) + return failure(); + + // We need to construct a new minor identity map since we will have lost + // some dimensions in folding away the expand shape. + auto minorIdMap = AffineMap::getMinorIdentityMap(sourceRank, vectorRank, + op.getContext()); + + rewriter.replaceOpWithNewOp( + op, op.getVectorType(), expandShapeOp.getViewSource(), + sourceIndices, minorIdMap, op.getPadding(), op.getMask(), + op.getInBounds()); + return success(); }) .DefaultUnreachable("unexpected operation"); - return success(); } template @@ -659,6 +686,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) { LoadOpOfExpandShapeOpFolder, LoadOpOfExpandShapeOpFolder, LoadOpOfExpandShapeOpFolder, + LoadOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, StoreOpOfExpandShapeOpFolder, diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp index a26edac98ea8d..2986f4c2d607d 100644 --- a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp +++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp @@ -106,14 +106,12 @@ ScalableValueBoundsConstraintSet::computeScalableBound( AffineMap bound = [&] { if (boundType == BoundType::EQ && !invalidBound(lowerBound) && - lowerBound[0] == upperBound[0]) { + lowerBound[0] == upperBound[0]) return lowerBound[0]; - } - if (boundType == BoundType::LB && !invalidBound(lowerBound)) { + if (boundType == BoundType::LB && !invalidBound(lowerBound)) 
return lowerBound[0]; - } else if (boundType == BoundType::UB && !invalidBound(upperBound)) { + if (boundType == BoundType::UB && !invalidBound(upperBound)) return upperBound[0]; - } return AffineMap{}; }(); diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index 106652623933f..ca91b0141f593 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -992,6 +992,55 @@ func.func @fold_vector_maskedstore_expand_shape( // ----- +func.func @fold_vector_transfer_read_expand_shape( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.transfer_read %0[%arg1, %c0], %pad {in_bounds = [true]} : memref<4x8xf32>, vector<8xf32> + return %1 : vector<8xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_expand_shape +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index +// CHECK: %[[C0:.*]] = arith.constant 0 +// CHECK: %[[PAD:.*]] = ub.poison : f32 +// CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (4, 8) +// CHECK: vector.transfer_read %[[ARG0]][%[[IDX]]], %[[PAD]] {in_bounds = [true]} + +// ----- + +func.func @fold_vector_transfer_read_with_perm_map( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<4x4xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + %1 = vector.transfer_read %0[%arg1, %c0], %pad { permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : memref<4x8xf32>, vector<4x4xf32> + return %1 : vector<4x4xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_with_perm_map +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: 
memref.expand_shape %[[ARG0]] {{\[}}[0, 1]] output_shape [4, 8] : memref<32xf32> into memref<4x8xf32> + +// ----- + +func.func @fold_vector_transfer_read_rank_mismatch( + %arg0 : memref<32xf32>, %arg1 : index) -> vector<4x4xf32> { + %c0 = arith.constant 0 : index + %pad = ub.poison : f32 + %0 = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [2, 4, 4] : memref<32xf32> into memref<2x4x4xf32> + %1 = vector.transfer_read %0[%arg1, %c0, %c0], %pad {in_bounds = [true, true]} : memref<2x4x4xf32>, vector<4x4xf32> + return %1 : vector<4x4xf32> +} + +// CHECK-LABEL: func @fold_vector_transfer_read_rank_mismatch +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<32xf32> +// CHECK: memref.expand_shape %[[ARG0]] {{\[}}[0, 1, 2]] output_shape [2, 4, 4] : memref<32xf32> into memref<2x4x4xf32> + +// ----- + func.func @fold_vector_load_collapse_shape( %arg0 : memref<4x8xf32>, %arg1 : index) -> vector<8xf32> { %0 = memref.collapse_shape %arg0 [[0, 1]] : memref<4x8xf32> into memref<32xf32>