+// MUSTACHE-GLOBAL-INDEX:

Global Namespace

+// MUSTACHE-GLOBAL-INDEX:

Namespaces

+// MUSTACHE-GLOBAL-INDEX:
  • @nonymous_namespace
  • +// MUSTACHE-GLOBAL-INDEX:
  • AnotherNamespace
  • +// MUSTACHE-GLOBAL-INDEX:
  • PrimaryNamespace
  • + // MD-GLOBAL-INDEX: # Global Namespace // MD-GLOBAL-INDEX: ## Namespaces // MD-GLOBAL-INDEX: * [@nonymous_namespace](..{{[\/]}}@nonymous_namespace{{[\/]}}index.md) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dfef341c0e257..3b67ee3819507 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -734,6 +734,8 @@ Bug Fixes in This Version - Accept empty enumerations in MSVC-compatible C mode. (#GH114402) - Fix a bug leading to incorrect code generation with complex number compound assignment and bitfield values, which also caused a crash with UBsan. (#GH166798) - Fixed false-positive shadow diagnostics for lambdas in explicit object member functions. (#GH163731) +- Fix an assertion failure when a ``target_clones`` attribute is only on the + forward declaration of a multiversioned function. (#GH165517) (#GH129483) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index c99fd6f0bfcc4..7321bf4ea8963 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -240,7 +240,6 @@ struct MissingFeatures { static bool ctorConstLvalueToRvalueConversion() { return false; } static bool ctorMemcpyizer() { return false; } static bool cudaSupport() { return false; } - static bool cxxRecordStaticMembers() { return false; } static bool dataLayoutTypeIsSized() { return false; } static bool dataLayoutTypeAllocSize() { return false; } static bool dataLayoutTypeStoreSize() { return false; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 555aa5c050ffd..591457b1d66b4 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1790,7 +1790,9 @@ void NamedDecl::printNestedNameSpecifier(raw_ostream &OS, else OS << *ND; } else if (const auto *RD = dyn_cast(DC)) { - if (!RD->getIdentifier()) + if (TypedefNameDecl *TD = RD->getTypedefNameForAnonDecl()) + OS 
<< *TD; + else if (!RD->getIdentifier()) OS << "(anonymous " << RD->getKindName() << ')'; else OS << *RD; diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp index 40888e7326659..41a193e4d85c5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp @@ -19,12 +19,9 @@ using namespace clang::CIRGen; namespace { struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { - SourceRange declareRange; mlir::acc::DeclareEnterOp enterOp; - OpenACCDeclareCleanup(SourceRange declareRange, - mlir::acc::DeclareEnterOp enterOp) - : declareRange(declareRange), enterOp(enterOp) {} + OpenACCDeclareCleanup(mlir::acc::DeclareEnterOp enterOp) : enterOp(enterOp) {} template void createOutOp(CIRGenFunction &cgf, InTy inOp) { @@ -55,8 +52,8 @@ struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { if (auto copyin = val.getDefiningOp()) { switch (copyin.getDataClause()) { default: - cgf.cgm.errorNYI(declareRange, - "OpenACC local declare clause copyin cleanup"); + llvm_unreachable( + "OpenACC local declare clause copyin unexpected data clause"); break; case mlir::acc::DataClause::acc_copy: createOutOp(cgf, copyin); @@ -65,6 +62,24 @@ struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { createOutOp(cgf, copyin); break; } + } else if (auto create = val.getDefiningOp()) { + switch (create.getDataClause()) { + default: + llvm_unreachable( + "OpenACC local declare clause create unexpected data clause"); + break; + case mlir::acc::DataClause::acc_copyout: + createOutOp(cgf, create); + break; + case mlir::acc::DataClause::acc_create: + createOutOp(cgf, create); + break; + } + } else if (auto present = val.getDefiningOp()) { + createOutOp(cgf, present); + } else if (auto dev_res = + val.getDefiningOp()) { + createOutOp(cgf, dev_res); } else if (val.getDefiningOp()) { // Link has no exit clauses, and shouldn't be copied. 
continue; @@ -72,7 +87,7 @@ struct OpenACCDeclareCleanup final : EHScopeStack::Cleanup { // DevicePtr has no exit clauses, and shouldn't be copied. continue; } else { - cgf.cgm.errorNYI(declareRange, "OpenACC local declare clause cleanup"); + llvm_unreachable("OpenACC local declare clause unexpected defining op"); continue; } exitOp.getDataClauseOperandsMutable().append(val); @@ -91,7 +106,7 @@ void CIRGenFunction::emitOpenACCDeclare(const OpenACCDeclareDecl &d) { d.clauses()); ehStack.pushCleanup(CleanupKind::NormalCleanup, - d.getSourceRange(), enterOp); + enterOp); } void CIRGenFunction::emitOpenACCRoutine(const OpenACCRoutineDecl &d) { diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index e09d3de5aac9a..251c99c8cd45b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1556,10 +1556,14 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) { break; case Decl::ClassTemplateSpecialization: - case Decl::CXXRecord: + case Decl::CXXRecord: { + CXXRecordDecl *crd = cast(decl); assert(!cir::MissingFeatures::generateDebugInfo()); - assert(!cir::MissingFeatures::cxxRecordStaticMembers()); + for (auto *childDecl : crd->decls()) + if (isa(childDecl)) + emitTopLevelDecl(childDecl); break; + } case Decl::FileScopeAsm: // File-scope asm is ignored during device-side CUDA compilation. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 1e7a332d1dc22..60a089fe0e936 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -853,12 +853,16 @@ class OpenACCClauseCIREmitter final var, mlir::acc::DataClause::acc_copyout, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_copyout, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCopyOutClause"); } } @@ -875,12 +879,16 @@ class OpenACCClauseCIREmitter final addDataOperand( var, mlir::acc::DataClause::acc_create, clause.getModifierList(), /*structured=*/false, /*implicit=*/false); + } else if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_create, clause.getModifierList(), + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare construct remains. 
- return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitCreateClause"); } } @@ -976,12 +984,16 @@ class OpenACCClauseCIREmitter final addDataOperand( var, mlir::acc::DataClause::acc_present, {}, /*structured=*/true, /*implicit=*/false); + } else if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_present, {}, + /*structured=*/true, + /*implicit=*/false); } else if constexpr (isCombinedType) { applyToComputeOp(clause); } else { - // TODO: When we've implemented this for everything, switch this to an - // unreachable. declare remains. - return clauseNotImplemented(clause); + llvm_unreachable("Unknown construct kind in VisitPresentClause"); } } @@ -1123,6 +1135,18 @@ class OpenACCClauseCIREmitter final llvm_unreachable("Unknown construct kind in VisitReductionClause"); } } + + void VisitDeviceResidentClause(const OpenACCDeviceResidentClause &clause) { + if constexpr (isOneOfTypes) { + for (const Expr *var : clause.getVarList()) + addDataOperand( + var, mlir::acc::DataClause::acc_declare_device_resident, {}, + /*structured=*/true, + /*implicit=*/false); + } else { + llvm_unreachable("Unknown construct kind in VisitDeviceResidentClause"); + } + } }; template diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5ecf68db18858..f7a1b78dad95d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -11996,6 +11996,16 @@ static bool CheckMultiVersionAdditionalDecl( } } + // Redeclarations of a target_clones function may omit the attribute, in which + // case it will be inherited during declaration merging. + if (NewMVKind == MultiVersionKind::None && + OldMVKind == MultiVersionKind::TargetClones) { + NewFD->setIsMultiVersion(); + Redeclaration = true; + OldDecl = OldFD; + return false; + } + // Else, this is simply a non-redecl case. 
Checking the 'value' is only // necessary in the Target case, since The CPUSpecific/Dispatch cases are // handled in the attribute adding step. @@ -12119,8 +12129,9 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD, } // At this point, we have a multiversion function decl (in OldFD) AND an - // appropriate attribute in the current function decl. Resolve that these are - // still compatible with previous declarations. + // appropriate attribute in the current function decl (unless it's allowed to + // omit the attribute). Resolve that these are still compatible with previous + // declarations. return CheckMultiVersionAdditionalDecl(S, OldFD, NewFD, NewCPUDisp, NewCPUSpec, NewClones, Redeclaration, OldDecl, Previous); diff --git a/clang/test/CIR/CodeGen/static-members.cpp b/clang/test/CIR/CodeGen/static-members.cpp new file mode 100644 index 0000000000000..8722dc2a2bc6f --- /dev/null +++ b/clang/test/CIR/CodeGen/static-members.cpp @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck %s -check-prefix=CIR --input-file=%t.cir +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck %s -check-prefix=LLVM --input-file=%t-cir.ll +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck %s -check-prefix=OGCG --input-file=%t.ll + +struct HasDtor { + ~HasDtor(); +}; +struct S { + static inline HasDtor hd; +}; + +// CIR: cir.global linkonce_odr comdat @_ZN1S2hdE = #cir.zero : !rec_HasDtor + +// CIR: cir.func internal private @__cxx_global_var_init() { +// CIR: %[[HD:.*]] = cir.get_global @_ZN1S2hdE : !cir.ptr +// CIR: %[[DTOR:.*]] = cir.get_global @_ZN7HasDtorD1Ev : !cir.ptr)>> +// CIR: %[[DTOR_CAST:.*]] = cir.cast bitcast %[[DTOR]] : !cir.ptr)>> -> !cir.ptr)>> +// CIR: %[[HD_CAST:.*]] = cir.cast bitcast %[[HD]] : !cir.ptr -> !cir.ptr +// CIR: %[[HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr 
+// CIR: cir.call @__cxa_atexit(%[[DTOR_CAST]], %[[HD_CAST]], %[[HANDLE]]) + +// LLVM: @_ZN1S2hdE = linkonce_odr global %struct.HasDtor zeroinitializer, comdat +// LLVM: @_ZN5Outer5Inner2hdE = linkonce_odr global %struct.HasDtor zeroinitializer, comdat + +// LLVM: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_static_members.cpp, ptr null }] +// LLVM: define internal void @__cxx_global_var_init() +// LLVM: call void @__cxa_atexit(ptr @_ZN7HasDtorD1Ev, ptr @_ZN1S2hdE, ptr @__dso_handle) + +// FIXME(cir): OGCG has a guard variable for this case that we don't generate in CIR. +// This is needed because the variable linkonce_odr linkage. + +// OGCG: @_ZN1S2hdE = linkonce_odr global %struct.HasDtor zeroinitializer, comdat +// OGCG: @_ZGVN1S2hdE = linkonce_odr global i64 0, comdat($_ZN1S2hdE) +// OGCG: @_ZN5Outer5Inner2hdE = linkonce_odr global %struct.HasDtor zeroinitializer, comdat +// OGCG: @_ZGVN5Outer5Inner2hdE = linkonce_odr global i64 0, comdat($_ZN5Outer5Inner2hdE) +// OGCG: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [ +// OGCG-SAME: { i32, ptr, ptr } { i32 65535, ptr @__cxx_global_var_init, ptr @_ZN1S2hdE }, +// OGCG-SAME: { i32, ptr, ptr } { i32 65535, ptr @__cxx_global_var_init.1, ptr @_ZN5Outer5Inner2hdE }] + +// OGCG: define internal void @__cxx_global_var_init() {{.*}} section ".text.startup" comdat($_ZN1S2hdE) { +// OGCG: %[[GUARD:.*]] = load atomic i8, ptr @_ZGVN1S2hdE acquire +// OGCG: %[[UNINIT:.*]] = icmp eq i8 %[[GUARD]], 0 +// OGCG: br i1 %[[UNINIT]], label %[[INIT_CHECK:.*]], label %[[INIT_END:.*]] +// OGCG: [[INIT_CHECK:.*]]: +// OGCG: %[[GUARD_ACQUIRE:.*]] = call i32 @__cxa_guard_acquire(ptr @_ZGVN1S2hdE) +// OGCG: %[[TOBOOL:.*]] = icmp ne i32 %[[GUARD_ACQUIRE]], 0 +// OGCG: br i1 %[[TOBOOL]], label %[[INIT:.*]], label %[[INIT_END]] +// OGCG: [[INIT:.*]]: +// OGCG: %[[ATEXIT:.*]] = call i32 @__cxa_atexit(ptr @_ZN7HasDtorD1Ev, ptr @_ZN1S2hdE, ptr @__dso_handle) 
+// OGCG: call void @__cxa_guard_release(ptr @_ZGVN1S2hdE) +// OGCG: br label %[[INIT_END]] +// OGCG: [[INIT_END]]: + +struct Outer { + struct Inner { + static inline HasDtor hd; + }; +}; + +// CIR: cir.global linkonce_odr comdat @_ZN5Outer5Inner2hdE = #cir.zero : !rec_HasDtor +// CIR: cir.func internal private @__cxx_global_var_init.1() +// CIR: %[[HD:.*]] = cir.get_global @_ZN5Outer5Inner2hdE : !cir.ptr +// CIR: %[[DTOR:.*]] = cir.get_global @_ZN7HasDtorD1Ev : !cir.ptr)>> +// CIR: %[[DTOR_CAST:.*]] = cir.cast bitcast %[[DTOR]] : !cir.ptr)>> -> !cir.ptr)>> +// CIR: %[[HD_CAST:.*]] = cir.cast bitcast %[[HD]] : !cir.ptr -> !cir.ptr +// CIR: %[[HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr +// CIR: cir.call @__cxa_atexit(%[[DTOR_CAST]], %[[HD_CAST]], %[[HANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () + +// LLVM: define internal void @__cxx_global_var_init.1() +// LLVM: call void @__cxa_atexit(ptr @_ZN7HasDtorD1Ev, ptr @_ZN5Outer5Inner2hdE, ptr @__dso_handle) + +// OGCG: define internal void @__cxx_global_var_init.1() {{.*}} section ".text.startup" comdat($_ZN5Outer5Inner2hdE) { +// OGCG: %[[GUARD:.*]] = load atomic i8, ptr @_ZGVN5Outer5Inner2hdE acquire +// OGCG: %[[UNINIT:.*]] = icmp eq i8 %[[GUARD]], 0 +// OGCG: br i1 %[[UNINIT]], label %[[INIT_CHECK:.*]], label %[[INIT_END:.*]] +// OGCG: [[INIT_CHECK:.*]]: +// OGCG: %[[GUARD_ACQUIRE:.*]] = call i32 @__cxa_guard_acquire(ptr @_ZGVN5Outer5Inner2hdE) +// OGCG: %[[TOBOOL:.*]] = icmp ne i32 %[[GUARD_ACQUIRE]], 0 +// OGCG: br i1 %[[TOBOOL]], label %[[INIT:.*]], label %[[INIT_END]] +// OGCG: [[INIT:.*]]: +// OGCG: %[[ATEXIT:.*]] = call i32 @__cxa_atexit(ptr @_ZN7HasDtorD1Ev, ptr @_ZN5Outer5Inner2hdE, ptr @__dso_handle) +// OGCG: call void @__cxa_guard_release(ptr @_ZGVN5Outer5Inner2hdE) +// OGCG: br label %[[INIT_END]] +// OGCG: [[INIT_END]]: + + +// CIR: cir.func private @_GLOBAL__sub_I_static_members.cpp() +// CIR: cir.call @__cxx_global_var_init() + +// LLVM: define void @_GLOBAL__sub_I_static_members.cpp() 
+// LLVM: call void @__cxx_global_var_init() diff --git a/clang/test/CIR/CodeGenOpenACC/declare-copyout.cpp b/clang/test/CIR/CodeGenOpenACC/declare-copyout.cpp new file mode 100644 index 0000000000000..1d79cef894d5e --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-copyout.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. + + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare copyout(always:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) 
-> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> 
{dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_CREATE]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: 
cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyout(alwaysout:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: 
%[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare copyout(alwaysout:LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_CREATE]] : 
!cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare copyout(always:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, 
modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare copyout(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] 
: si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {dataClause = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) to varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) {name = "LocalHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) to varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) {name = "LocalInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[LOC_HSE_ARR_CREATE]] : !cir.ptr>) bounds(%[[BOUND2]]) to varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) {name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) to varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) to varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.copyout accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) to varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) {modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CIR/CodeGenOpenACC/declare-create.cpp b/clang/test/CIR/CodeGenOpenACC/declare-create.cpp new file mode 100644 index 0000000000000..ef2f1de19ea96 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-create.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. 
+ + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare create(zero:ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = 
builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = 
"ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_CREATE]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare create(zero:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : 
!cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare create(zero:LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) 
startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {modifiers = #acc, name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_CREATE]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, modifiers = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = 
cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare create(zero:ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_CREATE:.*]] = acc.create varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_CREATE:.*]] = acc.create varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {modifiers = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare create(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name 
= "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_CREATE:.*]] = acc.create varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_CREATE:.*]] = acc.create varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_CREATE]], %[[LOC_INT_CREATE]], %[[LOC_HSE_ARR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_CREATE]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_CREATE]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_CREATE]], %[[ARG_INT_CREATE]], %[[ARG_HSE_PTR_CREATE]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_CREATE]] : !cir.ptr) {dataClause = #acc, modifiers = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_CREATE]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, modifiers = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CIR/CodeGenOpenACC/declare-deviceresident.cpp b/clang/test/CIR/CodeGenOpenACC/declare-deviceresident.cpp new file mode 100644 index 0000000000000..dbec4f22a1bb3 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-deviceresident.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. 
+ + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare device_resident(ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = 
builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // 
CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare device_resident(ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: 
%[[ARG_INT_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare device_resident(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_DEV_RES:.*]] = 
acc.declare_device_resident varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: 
%[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare device_resident(ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare device_resident(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_DEV_RES:.*]] = acc.declare_device_resident 
varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_DEV_RES:.*]] = acc.declare_device_resident varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_DEV_RES]], %[[LOC_INT_DEV_RES]], %[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_DEV_RES]], %[[ARG_INT_DEV_RES]], %[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_DEV_RES]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_DEV_RES]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CIR/CodeGenOpenACC/declare-present.cpp b/clang/test/CIR/CodeGenOpenACC/declare-present.cpp new file mode 100644 index 0000000000000..c17b9597adf12 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/declare-present.cpp @@ -0,0 +1,199 @@ +// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s + +struct HasSideEffects { + HasSideEffects(); + ~HasSideEffects(); +}; + +// TODO: OpenACC: Implement 'global', NS lowering. + +struct Struct { + static const HasSideEffects StaticMemHSE; + static const HasSideEffects StaticMemHSEArr[5]; + static const int StaticMemInt; + + // TODO: OpenACC: Implement static-local lowering. 
+ + void MemFunc1(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc1{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + int LocalInt; + +#pragma acc declare present(ArgHSE, ArgInt, LocalHSE, LocalInt, ArgHSEPtr[1:1], LocalHSEArr[1:1]) + // CHECK: %[[ARG_HSE_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_PRESENT:.*]] = acc.present varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[LOC_HSE_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_PRESENT:.*]] = acc.present varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = 
arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]], %[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER]]) dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]], %[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr, !cir.ptr>, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete 
accPtr(%[[LOC_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + void MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr); +}; + +void use() { + Struct s; + s.MemFunc1(HasSideEffects{}, 0, nullptr); +} + +void Struct::MemFunc2(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}MemFunc2{{.*}}(%{{.*}}: !cir.ptr{{.*}}, %[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: cir.alloca{{.*}}["this" + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.load + HasSideEffects LocalHSE; + // CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare present(ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_PRESENT:.*]] = acc.present varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : 
!s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + +#pragma acc declare present(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_PRESENT:.*]] = acc.present varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], 
%[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], %[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + // + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} +} + +extern "C" void do_thing(); + +extern "C" void NormalFunc(HasSideEffects ArgHSE, int ArgInt, HasSideEffects *ArgHSEPtr) { + // CHECK: cir.func {{.*}}NormalFunc(%[[ARG_HSE:.*]]: !rec_HasSideEffects{{.*}}, %[[ARG_INT:.*]]: !s32i {{.*}}, %[[ARG_HSE_PTR:.*]]: !cir.ptr{{.*}}) + // CHECK-NEXT: %[[ARG_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["ArgHSE" + // CHECK-NEXT: %[[ARG_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["ArgInt + // CHECK-NEXT: %[[ARG_HSE_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr{{.*}}["ArgHSEPtr" + // CHECK-NEXT: %[[LOC_HSE_ALLOCA:.*]] = cir.alloca !rec_HasSideEffects{{.*}}["LocalHSE + // CHECK-NEXT: %[[LOC_HSE_ARR_ALLOCA:.*]] = cir.alloca !cir.array{{.*}}["LocalHSEArr + // CHECK-NEXT: %[[LOC_INT_ALLOCA:.*]] = cir.alloca !s32i{{.*}}["LocalInt + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + // CHECK-NEXT: cir.store + HasSideEffects LocalHSE; + // 
CHECK-NEXT: cir.call{{.*}} : (!cir.ptr) -> () + HasSideEffects LocalHSEArr[5]; + // CHECK: do { + // CHECK: } while { + // CHECK: } + int LocalInt; +#pragma acc declare present(ArgHSE, ArgInt, ArgHSEPtr[1:1]) + // CHECK: %[[ARG_HSE_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgHSE"} + // CHECK-NEXT: %[[ARG_INT_PRESENT:.*]] = acc.present varPtr(%[[ARG_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "ArgInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND1:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[ARG_HSE_PTR_PRESENT:.*]] = acc.present varPtr(%[[ARG_HSE_PTR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND1]]) -> !cir.ptr> {name = "ArgHSEPtr[1:1]"} + // CHECK-NEXT: %[[ENTER1:.*]] = acc.declare_enter dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + { + // CHECK-NEXT: cir.scope { +#pragma acc declare present(LocalHSE, LocalInt, LocalHSEArr[1:1]) + // CHECK-NEXT: %[[LOC_HSE_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalHSE"} + // CHECK-NEXT: %[[LOC_INT_PRESENT:.*]] = acc.present varPtr(%[[LOC_INT_ALLOCA]] : !cir.ptr) -> !cir.ptr {name = "LocalInt"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[LB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[UB:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // 
CHECK-NEXT: %[[IDX:.*]] = arith.constant 0 : i64 + // CHECK-NEXT: %[[STRIDE:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[BOUND2:.*]] = acc.bounds lowerbound(%[[LB]] : si32) extent(%[[UB]] : si32) stride(%[[STRIDE]] : i64) startIdx(%[[IDX]] : i64) + // CHECK-NEXT: %[[LOC_HSE_ARR_PRESENT:.*]] = acc.present varPtr(%[[LOC_HSE_ARR_ALLOCA]] : !cir.ptr>) bounds(%[[BOUND2]]) -> !cir.ptr> {name = "LocalHSEArr[1:1]"} + // CHECK-NEXT: %[[ENTER2:.*]] = acc.declare_enter dataOperands(%[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], %[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER2]]) dataOperands(%[[LOC_HSE_PRESENT]], %[[LOC_INT_PRESENT]], %[[LOC_HSE_ARR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "LocalInt"} + // CHECK-NEXT: acc.delete accPtr(%[[LOC_HSE_ARR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND2]]) {dataClause = #acc, name = "LocalHSEArr[1:1]"} + } + // CHECK-NEXT: } + + // Make sure that cleanup gets put in the right scope. 
+ do_thing(); + // CHECK-NEXT: cir.call @do_thing + // CHECK-NEXT: acc.declare_exit token(%[[ENTER1]]) dataOperands(%[[ARG_HSE_PRESENT]], %[[ARG_INT_PRESENT]], %[[ARG_HSE_PTR_PRESENT]] : !cir.ptr, !cir.ptr, !cir.ptr>) + + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgHSE"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_INT_PRESENT]] : !cir.ptr) {dataClause = #acc, name = "ArgInt"} + // CHECK-NEXT: acc.delete accPtr(%[[ARG_HSE_PTR_PRESENT]] : !cir.ptr>) bounds(%[[BOUND1]]) {dataClause = #acc, name = "ArgHSEPtr[1:1]"} +} + diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp index c8b85a12f84e7..43d91f180acaf 100644 --- a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp +++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp @@ -1,8 +1,5 @@ // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fopenacc -fclangir -emit-cir %s -o %t.cir -verify -void HelloWorld(int *A) { - extern int *E; - -// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: create}} +int E, A; +// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Declare Construct}} #pragma acc declare link(E) create(A) -} diff --git a/clang/test/CodeGen/attr-target-clones.c b/clang/test/CodeGen/attr-target-clones.c index 295b25d6478eb..56db77c2b09a3 100644 --- a/clang/test/CodeGen/attr-target-clones.c +++ b/clang/test/CodeGen/attr-target-clones.c @@ -125,6 +125,35 @@ void __attribute__((target_clones("default, arch=ivybridge"))) unused(void) {} // WINDOWS: musttail call void @unused.arch_ivybridge.0 // WINDOWS: musttail call void @unused.default.1 +int __attribute__((target_clones("sse4.2, default"))) inherited(void); +int inherited(void) { return 0; } +// LINUX: define {{.*}}i32 @inherited.sse4.2.0() +// LINUX: define {{.*}}i32 @inherited.default.1() +// LINUX: define weak_odr ptr @inherited.resolver() #[[ATTR_RESOLVER]] comdat 
+// LINUX: ret ptr @inherited.sse4.2.0 +// LINUX: ret ptr @inherited.default.1 + +// DARWIN: define {{.*}}i32 @inherited.sse4.2.0() +// DARWIN: define {{.*}}i32 @inherited.default.1() +// DARWIN: define weak_odr ptr @inherited.resolver() #[[ATTR_RESOLVER]] { +// DARWIN: ret ptr @inherited.sse4.2.0 +// DARWIN: ret ptr @inherited.default.1 + +// WINDOWS: define dso_local i32 @inherited.sse4.2.0() +// WINDOWS: define dso_local i32 @inherited.default.1() +// WINDOWS: define weak_odr dso_local i32 @inherited() #[[ATTR_RESOLVER]] comdat +// WINDOWS: musttail call i32 @inherited.sse4.2.0 +// WINDOWS: musttail call i32 @inherited.default.1 + +int test_inherited(void) { + // LINUX: define {{.*}}i32 @test_inherited() #[[DEF:[0-9]+]] + // DARWIN: define {{.*}}i32 @test_inherited() #[[DEF:[0-9]+]] + // WINDOWS: define dso_local i32 @test_inherited() #[[DEF:[0-9]+]] + return inherited(); + // LINUX: call i32 @inherited() + // DARWIN: call i32 @inherited() + // WINDOWS: call i32 @inherited() +} inline int __attribute__((target_clones("arch=sandybridge,default,sse4.2"))) foo_inline(void) { return 0; } diff --git a/clang/test/Sema/attr-target-clones.c b/clang/test/Sema/attr-target-clones.c index 4597ea54d02bf..40688772eeb96 100644 --- a/clang/test/Sema/attr-target-clones.c +++ b/clang/test/Sema/attr-target-clones.c @@ -28,6 +28,17 @@ int __attribute__((target_clones("sse4.2", "arch=atom", "default"))) redecl4(voi int __attribute__((target_clones("sse4.2", "arch=sandybridge", "default"))) redecl4(void) { return 1; } +int __attribute__((target_clones("sse4.2", "default"))) redecl5(void); +int redecl5(void) { return 1; } + +int redecl6(void); +int __attribute__((target_clones("sse4.2", "default"))) redecl6(void) { return 1; } + +int __attribute__((target_clones("sse4.2", "default"))) redecl7(void); +// expected-error@+2 {{multiversioning attributes cannot be combined}} +// expected-note@-2 {{previous declaration is here}} +int __attribute__((target("sse4.2"))) redecl7(void) { return 
1; } + int __attribute__((target("sse4.2"))) redef2(void) { return 1; } // expected-error@+2 {{multiversioning attributes cannot be combined}} // expected-note@-2 {{previous declaration is here}} @@ -87,6 +98,8 @@ int useage(void) { int __attribute__((target_clones("sse4.2", "default"))) mv_after_use(void) { return 1; } void bad_overload1(void) __attribute__((target_clones("mmx", "sse4.2", "default"))); +// expected-error@+2 {{conflicting types for 'bad_overload1'}} +// expected-note@-2 {{previous declaration is here}} void bad_overload1(int p) {} void bad_overload2(int p) {} diff --git a/clang/unittests/AST/TypePrinterTest.cpp b/clang/unittests/AST/TypePrinterTest.cpp index 410ec021d6e72..3cadf9b265bd1 100644 --- a/clang/unittests/AST/TypePrinterTest.cpp +++ b/clang/unittests/AST/TypePrinterTest.cpp @@ -341,3 +341,22 @@ TEST(TypePrinter, NestedNameSpecifiers) { Policy.AnonymousTagLocations = false; })); } + +TEST(TypePrinter, NestedNameSpecifiersTypedef) { + constexpr char Code[] = R"cpp( + typedef union { + struct { + struct { + unsigned int baz; + } bar; + }; + } foo; + )cpp"; + + ASSERT_TRUE(PrintedTypeMatches( + Code, {}, fieldDecl(hasName("bar"), hasType(qualType().bind("id"))), + "struct foo::(anonymous struct)::(unnamed)", [](PrintingPolicy &Policy) { + Policy.FullyQualifiedName = true; + Policy.AnonymousTagLocations = false; + })); +} diff --git a/flang/test/Transforms/OpenACC/acc-implicit-data.fir b/flang/test/Transforms/OpenACC/acc-implicit-data.fir index 7f6a57cb4d8c6..2d28c341d0d5e 100644 --- a/flang/test/Transforms/OpenACC/acc-implicit-data.fir +++ b/flang/test/Transforms/OpenACC/acc-implicit-data.fir @@ -133,7 +133,7 @@ func.func @test_fir_derivedtype_in_parallel_defaultpresent() { return } -// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {implicit = true, name = "aggrvar"} +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {acc.from_default, implicit = true, name = "aggrvar"} // CHECK: 
acc.delete accPtr(%[[PRESENT]] : !fir.ref>) {dataClause = #acc, implicit = true, name = "aggrvar"} // ----- @@ -147,7 +147,7 @@ func.func @test_fir_derivedtype_in_kernels_defaultpresent() { return } -// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {implicit = true, name = "aggrvar"} +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {acc.from_default, implicit = true, name = "aggrvar"} // CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref>) {dataClause = #acc, implicit = true, name = "aggrvar"} // ----- @@ -161,7 +161,7 @@ func.func @test_fir_array_in_parallel_defaultpresent() { return } -// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {implicit = true, name = "arrayvar"} +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {acc.from_default, implicit = true, name = "arrayvar"} // CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref>) {dataClause = #acc, implicit = true, name = "arrayvar"} // ----- @@ -175,7 +175,7 @@ func.func @test_fir_array_in_kernels_defaultpresent() { return } -// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {implicit = true, name = "arrayvar"} +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : !fir.ref>) -> !fir.ref> {acc.from_default, implicit = true, name = "arrayvar"} // CHECK: acc.delete accPtr(%[[PRESENT]] : !fir.ref>) {dataClause = #acc, implicit = true, name = "arrayvar"} // ----- diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index b8e0e9b5a1814..6ea323ced3003 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -83,6 +83,8 @@ Improvements and New Features iterators, resulting in a performance improvement for ``std::deque`` and ``std::join_view>>`` iterators. +- The ``num_get::do_get`` integral overloads have been optimized, resulting in a performance improvement of up to 2.8x. 
+ Deprecations and Removals ------------------------- diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 4b2713191c1c0..f289666ec12ab 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -520,7 +520,6 @@ set(files __locale_dir/locale_base_api.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h __locale_dir/locale_base_api/ibm.h - __locale_dir/locale_base_api/musl.h __locale_dir/locale_base_api/openbsd.h __locale_dir/messages.h __locale_dir/money.h diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index aaeb8a881df18..f73c9ea4b6ea7 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -114,6 +114,27 @@ template }(make_index_sequence<__simd_vector_size_v<_VecT>>{}); } +// Load the first _Np elements, zero the rest +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wpsabi") +template +[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __partial_load(_Iter __iter) noexcept { + return [=]( + index_sequence<_LoadIndices...>, index_sequence<_ZeroIndices...>) _LIBCPP_ALWAYS_INLINE noexcept { + return _VecT{__iter[_LoadIndices]..., ((void)_ZeroIndices, 0)...}; + }(make_index_sequence<_Np>{}, make_index_sequence<__simd_vector_size_v<_VecT> - _Np>{}); +} + +// Create a vector where every element is __val +template +[[__nodiscard__]] _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT +__broadcast(__simd_vector_underlying_type_t<_VecT> __val) { + return [&](index_sequence<_Indices...>) { + return _VecT{((void)_Indices, __val)...}; + }(make_index_sequence<__simd_vector_size_v<_VecT>>()); +} +_LIBCPP_DIAGNOSTIC_POP + template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept { return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector)); @@ -124,6 +145,11 @@ template return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector)); }
+template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __none_of(__simd_vector<_Tp, _Np> __vec) noexcept { + return !__builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector)); +} + template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept { using __mask_vec = __simd_vector; diff --git a/libcxx/include/__config b/libcxx/include/__config index d79ace0cbb896..1b27f28f9ddef 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -546,7 +546,10 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_(m) # endif -# if defined(__DEPRECATED) && __DEPRECATED && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. +# if defined(__DEPRECATED) && __DEPRECATED && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) && 0 # define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 1 # else # define _LIBCPP_DIAGNOSE_DEPRECATED_HEADERS 0 diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index fef90bb77991f..d26d529d4e0c2 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -57,8 +57,6 @@ // float __strtof(const char*, char**, __locale_t); // double __strtod(const char*, char**, __locale_t); // long double __strtold(const char*, char**, __locale_t); -// long long __strtoll(const char*, char**, __locale_t); -// unsigned long long __strtoull(const char*, char**, __locale_t); // } // // Character manipulation functions @@ -104,7 +102,6 @@ // // int __snprintf(char*, size_t, __locale_t, const char*, ...); // required by the headers // int __asprintf(char**, __locale_t, const char*, ...); // required by the headers -// int __sscanf(const char*, 
__locale_t, const char*, ...); // required by the headers // } #if _LIBCPP_HAS_LOCALIZATION @@ -131,8 +128,6 @@ # include <__locale_dir/locale_base_api/ibm.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> -# elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC -# include <__locale_dir/locale_base_api/musl.h> # endif # include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> @@ -192,15 +187,6 @@ inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __ return strtold_l(__nptr, __endptr, __loc); } -inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return strtoll_l(__nptr, __endptr, __base, __loc); -} - -inline _LIBCPP_HIDE_FROM_ABI unsigned long long -__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return strtoull_l(__nptr, __endptr, __base, __loc); -} - // // Character manipulation functions // @@ -299,11 +285,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __ char** __s, __locale_t __loc, const char* __format, _Args&&... __args) { return std::__libcpp_asprintf_l(__s, __loc, __format, std::forward<_Args>(__args)...); } -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( - const char* __s, __locale_t __loc, const char* __format, _Args&&... 
__args) { - return std::__libcpp_sscanf_l(__s, __loc, __format, std::forward<_Args>(__args)...); -} _LIBCPP_DIAGNOSTIC_POP # undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT diff --git a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h index b62a1b737e97f..8cdbe0cd15051 100644 --- a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h +++ b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h @@ -125,16 +125,6 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __libcpp_asprintf_l( return __res; } -inline _LIBCPP_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __libcpp_sscanf_l( - const char* __s, locale_t __l, const char* __format, ...) { - va_list __va; - va_start(__va, __format); - __locale_guard __current(__l); - int __res = vsscanf(__s, __format, __va); - va_end(__va); - return __res; -} - _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H diff --git a/libcxx/include/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__locale_dir/locale_base_api/ibm.h index 1d1d15df9f799..47a83eac7df36 100644 --- a/libcxx/include/__locale_dir/locale_base_api/ibm.h +++ b/libcxx/include/__locale_dir/locale_base_api/ibm.h @@ -53,11 +53,6 @@ struct __setAndRestore { // The following are not POSIX routines. 
These are quick-and-dirty hacks // to make things pretend to work -inline _LIBCPP_HIDE_FROM_ABI long long strtoll_l(const char* __nptr, char** __endptr, int __base, locale_t locale) { - __setAndRestore __newloc(locale); - return ::strtoll(__nptr, __endptr, __base); -} - inline _LIBCPP_HIDE_FROM_ABI double strtod_l(const char* __nptr, char** __endptr, locale_t locale) { __setAndRestore __newloc(locale); return ::strtod(__nptr, __endptr); @@ -73,12 +68,6 @@ inline _LIBCPP_HIDE_FROM_ABI long double strtold_l(const char* __nptr, char** __ return ::strtold(__nptr, __endptr); } -inline _LIBCPP_HIDE_FROM_ABI unsigned long long -strtoull_l(const char* __nptr, char** __endptr, int __base, locale_t locale) { - __setAndRestore __newloc(locale); - return ::strtoull(__nptr, __endptr, __base); -} - inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 2, 0) int vasprintf(char** strp, const char* fmt, va_list ap) { const size_t buff_size = 256; diff --git a/libcxx/include/__locale_dir/locale_base_api/musl.h b/libcxx/include/__locale_dir/locale_base_api/musl.h deleted file mode 100644 index 1653214cdba1e..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/musl.h +++ /dev/null @@ -1,31 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This adds support for the extended locale functions that are currently -// missing from the Musl C library. -// -// This only works when the specified locale is "C" or "POSIX", but that's -// about as good as we can do without implementing full xlocale support -// in Musl. 
-//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H - -#include -#include - -inline _LIBCPP_HIDE_FROM_ABI long long strtoll_l(const char* __nptr, char** __endptr, int __base, locale_t) { - return ::strtoll(__nptr, __endptr, __base); -} - -inline _LIBCPP_HIDE_FROM_ABI unsigned long long strtoull_l(const char* __nptr, char** __endptr, int __base, locale_t) { - return ::strtoull(__nptr, __endptr, __base); -} - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H diff --git a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h index 6eef9b3095d6e..98b8eb0c600f5 100644 --- a/libcxx/include/__locale_dir/num.h +++ b/libcxx/include/__locale_dir/num.h @@ -12,6 +12,7 @@ #include <__algorithm/copy.h> #include <__algorithm/find.h> #include <__algorithm/reverse.h> +#include <__algorithm/simd_utils.h> #include <__charconv/to_chars_integral.h> #include <__charconv/traits.h> #include <__config> @@ -48,9 +49,9 @@ struct _LIBCPP_EXPORTED_FROM_ABI __num_get_base { static int __get_base(ios_base&); static const char __src[33]; // "0123456789abcdefABCDEFxX+-pPiInN" // count of leading characters in __src used for parsing integers ("012..X+-") - static const size_t __int_chr_cnt = 26; + static inline const size_t __int_chr_cnt = 26; // count of leading characters in __src used for parsing floating-point values ("012..-pP") - static const size_t __fp_chr_cnt = 28; + static inline const size_t __fp_chr_cnt = 28; }; template @@ -73,7 +74,8 @@ struct __num_get : protected __num_get_base { [[__deprecated__("This exists only for ABI compatibility")]] static string __stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep); - static int __stage2_int_loop( + + [[__deprecated__("This exists only for ABI compatibility")]] static int __stage2_int_loop( _CharT __ct, int __base, char* __a, @@ -85,11 +87,24 @@ struct __num_get 
: protected __num_get_base { unsigned*& __g_end, _CharT* __atoms); - _LIBCPP_HIDE_FROM_ABI static string __stage2_int_prep(ios_base& __iob, _CharT& __thousands_sep) { - locale __loc = __iob.getloc(); - const numpunct<_CharT>& __np = use_facet >(__loc); - __thousands_sep = __np.thousands_sep(); - return __np.grouping(); + _LIBCPP_HIDE_FROM_ABI static ptrdiff_t __atoms_offset(const _CharT* __atoms, _CharT __val) { + // TODO: Remove the manual vectorization once https://llvm.org/PR168551 is resolved +# if _LIBCPP_HAS_ALGORITHM_VECTOR_UTILS + if constexpr (is_same<_CharT, char>::value) { + // TODO(LLVM 24): This can be removed, since -Wpsabi doesn't warn on [[gnu::always_inline]] functions anymore. + _LIBCPP_DIAGNOSTIC_PUSH + _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wpsabi") + using __vec = __simd_vector; + __vec __chars = std::__broadcast<__vec>(__val); + __vec __cmp = std::__partial_load<__vec, __int_chr_cnt>(__atoms); + auto __res = __chars == __cmp; + if (std::__none_of(__res)) + return __int_chr_cnt; + return std::min(__int_chr_cnt, std::__find_first_set(__res)); + _LIBCPP_DIAGNOSTIC_POP + } +# endif + return std::find(__atoms, __atoms + __int_chr_cnt, __val) - __atoms; } _LIBCPP_HIDE_FROM_ABI const _CharT* __do_widen(ios_base& __iob, _CharT* __atoms) const { @@ -122,54 +137,6 @@ string __num_get<_CharT>::__stage2_float_prep( return __np.grouping(); } -template -int __num_get<_CharT>::__stage2_int_loop( - _CharT __ct, - int __base, - char* __a, - char*& __a_end, - unsigned& __dc, - _CharT __thousands_sep, - const string& __grouping, - unsigned* __g, - unsigned*& __g_end, - _CharT* __atoms) { - if (__a_end == __a && (__ct == __atoms[24] || __ct == __atoms[25])) { - *__a_end++ = __ct == __atoms[24] ? 
'+' : '-'; - __dc = 0; - return 0; - } - if (__grouping.size() != 0 && __ct == __thousands_sep) { - if (__g_end - __g < __num_get_buf_sz) { - *__g_end++ = __dc; - __dc = 0; - } - return 0; - } - ptrdiff_t __f = std::find(__atoms, __atoms + __int_chr_cnt, __ct) - __atoms; - if (__f >= 24) - return -1; - switch (__base) { - case 8: - case 10: - if (__f >= __base) - return -1; - break; - case 16: - if (__f < 22) - break; - if (__a_end != __a && __a_end - __a <= 2 && __a_end[-1] == '0') { - __dc = 0; - *__a_end++ = __src[__f]; - return 0; - } - return -1; - } - *__a_end++ = __src[__f]; - ++__dc; - return 0; -} - template int __num_get<_CharT>::__stage2_float_loop( _CharT __ct, @@ -274,65 +241,6 @@ _LIBCPP_HIDE_FROM_ABI _Tp __num_get_float(const char* __a, const char* __a_end, return 0; } -template -_LIBCPP_HIDE_FROM_ABI _Tp -__num_get_signed_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { - if (__a != __a_end) { - __libcpp_remove_reference_t __save_errno = errno; - errno = 0; - char* __p2; - long long __ll = __locale::__strtoll(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); - __libcpp_remove_reference_t __current_errno = errno; - if (__current_errno == 0) - errno = __save_errno; - if (__p2 != __a_end) { - __err = ios_base::failbit; - return 0; - } else if (__current_errno == ERANGE || __ll < numeric_limits<_Tp>::min() || numeric_limits<_Tp>::max() < __ll) { - __err = ios_base::failbit; - if (__ll > 0) - return numeric_limits<_Tp>::max(); - else - return numeric_limits<_Tp>::min(); - } - return static_cast<_Tp>(__ll); - } - __err = ios_base::failbit; - return 0; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__num_get_unsigned_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { - if (__a != __a_end) { - const bool __negate = *__a == '-'; - if (__negate && ++__a == __a_end) { - __err = ios_base::failbit; - return 0; - } - __libcpp_remove_reference_t __save_errno = errno; - errno = 0; - char* __p2; - 
unsigned long long __ll = __locale::__strtoull(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); - __libcpp_remove_reference_t __current_errno = errno; - if (__current_errno == 0) - errno = __save_errno; - if (__p2 != __a_end) { - __err = ios_base::failbit; - return 0; - } else if (__current_errno == ERANGE || numeric_limits<_Tp>::max() < __ll) { - __err = ios_base::failbit; - return numeric_limits<_Tp>::max(); - } - _Tp __res = static_cast<_Tp>(__ll); - if (__negate) - __res = -__res; - return __res; - } - __err = ios_base::failbit; - return 0; -} - template > class num_get : public locale::facet, private __num_get<_CharT> { public: @@ -470,137 +378,194 @@ class num_get : public locale::facet, private __num_get<_CharT> { return __b; } - template - _LIBCPP_HIDE_FROM_ABI iter_type - __do_get_signed(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Signed& __v) const { + template + iter_type __do_get_integral( + iter_type __first, iter_type __last, ios_base& __iob, ios_base::iostate& __err, _MaybeSigned& __v) const { + using _Unsigned = __make_unsigned_t<_MaybeSigned>; + // Stage 1 int __base = this->__get_base(__iob); - // Stage 2 - char_type __thousands_sep; - const int __atoms_size = __num_get_base::__int_chr_cnt; - char_type __atoms1[__atoms_size]; - const char_type* __atoms = this->__do_widen(__iob, __atoms1); - string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; + + // Stages 2 & 3 + // These are combined into a single step where we parse the characters and calculate the value in one go instead of + // storing the relevant characters first (in an allocated buffer) and parse the characters after we extracted them. + // This makes the whole process significantly faster, since we avoid potential allocations and copies. 
+ + const auto& __numpunct = use_facet >(__iob.getloc()); + char_type __thousands_sep = __numpunct.thousands_sep(); + string __grouping = __numpunct.grouping(); + + char_type __atoms_buffer[__num_get_base::__int_chr_cnt]; + const char_type* __atoms = this->__do_widen(__iob, __atoms_buffer); unsigned __g[__num_get_base::__num_get_buf_sz]; unsigned* __g_end = __g; unsigned __dc = 0; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; + + if (__first == __last) { + __err |= ios_base::eofbit | ios_base::failbit; + __v = 0; + return __first; + } + + while (!__grouping.empty() && *__first == __thousands_sep) { + ++__first; + if (__g_end - __g < this->__num_get_buf_sz) + *__g_end++ = 0; + } + + bool __negate = false; + // __c == '+' || __c == '-' + if (auto __c = *__first; __c == __atoms[24] || __c == __atoms[25]) { + __negate = __c == __atoms[25]; + ++__first; + } + + if (__first == __last) { + __err |= ios_base::eofbit | ios_base::failbit; + __v = 0; + return __first; + } + + bool __parsed_num = false; + + // If we don't have a pre-set base, figure it out and swallow any prefix + if (__base == 0) { + auto __c = *__first; + // __c == '0' + if (__c == __atoms[0]) { + ++__first; + if (__first == __last) { + __err |= ios_base::eofbit; + return __first; + } + // __c2 == 'x' || __c2 == 'X' + if (auto __c2 = *__first; __c2 == __atoms[22] || __c2 == __atoms[23]) { + __base = 16; + ++__first; + } else { + __base = 8; + } + } else { + __base = 10; + } + + // If the base has been specified explicitly, try to swallow the appropriate prefix. We only need to do something + // special for hex, since decimal has no prefix and octal's prefix is '0', which doesn't change the value that + // we'll parse if we don't swallow it. 
+ } else if (__base == 16) { + // Try to swallow '0x' + + // *__first == '0' + if (*__first == __atoms[0]) { + ++__first; + if (__first == __last) { + __err |= ios_base::eofbit; + __v = 0; + return __first; + } + // __c == 'x' || __c == 'X' + if (auto __c = *__first; __c == __atoms[22] || __c == __atoms[23]) + ++__first; + else + __parsed_num = true; // We only swallowed '0', so we've started to parse a number } - if (this->__stage2_int_loop( - *__b, - __base, - __a, - __a_end, - __dc, - __thousands_sep, - __grouping, - __g, - __g_end, - const_cast(__atoms))) - break; } - if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) - *__g_end++ = __dc; - // Stage 3 - __v = std::__num_get_signed_integral<_Signed>(__a, __a_end, __err, __base); - // Digit grouping checked - __check_grouping(__grouping, __g, __g_end, __err); - // EOF checked - if (__b == __e) - __err |= ios_base::eofbit; - return __b; - } - template - _LIBCPP_HIDE_FROM_ABI iter_type - __do_get_unsigned(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Unsigned& __v) const { - // Stage 1 - int __base = this->__get_base(__iob); - // Stage 2 - char_type __thousands_sep; - const int __atoms_size = __num_get_base::__int_chr_cnt; - char_type __atoms1[__atoms_size]; - const char_type* __atoms = this->__do_widen(__iob, __atoms1); - string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; - unsigned __g[__num_get_base::__num_get_buf_sz]; - unsigned* __g_end = __g; - unsigned __dc = 0; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; + // Calculate the actual number + _Unsigned __val = 0; + bool __overflowed = false; + for (; __first != __last; ++__first) { + auto __c = *__first; + if 
(!__grouping.empty() && __c == __thousands_sep) { + if (__g_end - __g < this->__num_get_buf_sz) { + *__g_end++ = __dc; + __dc = 0; + } + continue; } - if (this->__stage2_int_loop( - *__b, - __base, - __a, - __a_end, - __dc, - __thousands_sep, - __grouping, - __g, - __g_end, - const_cast(__atoms))) + auto __offset = this->__atoms_offset(__atoms, __c); + if (__offset >= 22) // Not a valid integer character + break; + + if (__base == 16 && __offset >= 16) + __offset -= 6; + if (__offset >= __base) break; + // __val = (__val * __base) + __offset + __overflowed |= __builtin_mul_overflow(__val, __base, std::addressof(__val)) || + __builtin_add_overflow(__val, __offset, std::addressof(__val)); + __parsed_num = true; + ++__dc; } + + if (!__parsed_num) { + __err |= ios_base::failbit; + __v = 0; + } else if (__overflowed) { + __err |= ios_base::failbit; + __v = is_signed<_MaybeSigned>::value && __negate + ? numeric_limits<_MaybeSigned>::min() + : numeric_limits<_MaybeSigned>::max(); + } else if (!__negate) { + if (__val > static_cast<_Unsigned>(numeric_limits<_MaybeSigned>::max())) { + __err |= ios_base::failbit; + __v = numeric_limits<_MaybeSigned>::max(); + } else { + __v = __val; + } + } else if (is_signed<_MaybeSigned>::value) { + if (__val > static_cast<_Unsigned>(numeric_limits<_MaybeSigned>::max()) + 1) { + __err |= ios_base::failbit; + __v = numeric_limits<_MaybeSigned>::min(); + } else if (__val == static_cast<_Unsigned>(numeric_limits<_MaybeSigned>::max()) + 1) { + __v = numeric_limits<_MaybeSigned>::min(); + } else { + __v = -__val; + } + } else { + __v = -__val; + } + if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) *__g_end++ = __dc; - // Stage 3 - __v = std::__num_get_unsigned_integral<_Unsigned>(__a, __a_end, __err, __base); + // Digit grouping checked __check_grouping(__grouping, __g, __g_end, __err); // EOF checked - if (__b == __e) + if (__first == __last) __err |= ios_base::eofbit; - return __b; + return __first; } virtual 
iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, bool& __v) const; virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, long& __v) const { - return this->__do_get_signed(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, long long& __v) const { - return this->__do_get_signed(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, unsigned short& __v) const { - return this->__do_get_unsigned(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, unsigned int& __v) const { - return this->__do_get_unsigned(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, unsigned long& __v) const { - return this->__do_get_unsigned(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, unsigned long long& __v) const { - return this->__do_get_unsigned(__b, __e, __iob, __err, __v); + return this->__do_get_integral(__b, __e, __iob, __err, __v); } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, float& __v) const { @@ -654,40 +619,13 @@ _InputIterator num_get<_CharT, _InputIterator>::do_get( template _InputIterator num_get<_CharT, _InputIterator>::do_get( iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, void*& __v) const { - // Stage 1 - int __base 
= 16; - // Stage 2 - char_type __atoms[__num_get_base::__int_chr_cnt]; - char_type __thousands_sep = char_type(); - string __grouping; - std::use_facet >(__iob.getloc()) - .widen(__num_get_base::__src, __num_get_base::__src + __num_get_base::__int_chr_cnt, __atoms); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; - unsigned __g[__num_get_base::__num_get_buf_sz]; - unsigned* __g_end = __g; - unsigned __dc = 0; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; - } - if (this->__stage2_int_loop(*__b, __base, __a, __a_end, __dc, __thousands_sep, __grouping, __g, __g_end, __atoms)) - break; - } - // Stage 3 - __buf.resize(__a_end - __a); - if (__locale::__sscanf(__buf.c_str(), _LIBCPP_GET_C_LOCALE, "%p", &__v) != 1) - __err = ios_base::failbit; - // EOF checked - if (__b == __e) - __err |= ios_base::eofbit; - return __b; + auto __flags = __iob.flags(); + __iob.flags((__flags & ~ios_base::basefield & ~ios_base::uppercase) | ios_base::hex); + uintptr_t __ptr; + auto __res = __do_get_integral(__b, __e, __iob, __err, __ptr); + __iob.flags(__flags); + __v = reinterpret_cast(__ptr); + return __res; } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS num_get; diff --git a/libcxx/include/__locale_dir/support/bsd_like.h b/libcxx/include/__locale_dir/support/bsd_like.h index 27735529d5524..6f533b4e1eab1 100644 --- a/libcxx/include/__locale_dir/support/bsd_like.h +++ b/libcxx/include/__locale_dir/support/bsd_like.h @@ -79,15 +79,6 @@ inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __ return ::strtold_l(__nptr, __endptr, __loc); } -inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return ::strtoll_l(__nptr, __endptr, __base, __loc); -} - -inline _LIBCPP_HIDE_FROM_ABI 
unsigned long long -__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return ::strtoull_l(__nptr, __endptr, __base, __loc); -} - // // Character manipulation functions // @@ -211,12 +202,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __ char** __s, __locale_t __loc, const char* __format, _Args&&... __args) { return ::asprintf_l(__s, __loc, __format, std::forward<_Args>(__args)...); // non-standard } - -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( - const char* __s, __locale_t __loc, const char* __format, _Args&&... __args) { - return ::sscanf_l(__s, __loc, __format, std::forward<_Args>(__args)...); -} _LIBCPP_DIAGNOSTIC_POP #undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT diff --git a/libcxx/include/__locale_dir/support/fuchsia.h b/libcxx/include/__locale_dir/support/fuchsia.h index 4b9e63facb19e..528bfeb0cb6e1 100644 --- a/libcxx/include/__locale_dir/support/fuchsia.h +++ b/libcxx/include/__locale_dir/support/fuchsia.h @@ -141,13 +141,6 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __ __locale_guard __current(__loc); return ::asprintf(__s, __format, std::forward<_Args>(__args)...); // non-standard } -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( - const char* __s, __locale_t __loc, const char* __format, _Args&&... 
__args) { - __locale_guard __current(__loc); - return std::sscanf(__s, __format, std::forward<_Args>(__args)...); -} - _LIBCPP_DIAGNOSTIC_POP #undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT diff --git a/libcxx/include/__locale_dir/support/linux.h b/libcxx/include/__locale_dir/support/linux.h index 94a2ecb9a940d..1a589be49bf1d 100644 --- a/libcxx/include/__locale_dir/support/linux.h +++ b/libcxx/include/__locale_dir/support/linux.h @@ -94,25 +94,6 @@ inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __ return ::strtold_l(__nptr, __endptr, __loc); } -inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { -#if !_LIBCPP_HAS_MUSL_LIBC - return ::strtoll_l(__nptr, __endptr, __base, __loc); -#else - (void)__loc; - return ::strtoll(__nptr, __endptr, __base); -#endif -} - -inline _LIBCPP_HIDE_FROM_ABI unsigned long long -__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { -#if !_LIBCPP_HAS_MUSL_LIBC - return ::strtoull_l(__nptr, __endptr, __base, __loc); -#else - (void)__loc; - return ::strtoull(__nptr, __endptr, __base); -#endif -} - // // Character manipulation functions // @@ -257,20 +238,6 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf( va_end(__va); return __res; } - -#ifndef _LIBCPP_COMPILER_GCC // GCC complains that this can't be always_inline due to C-style varargs -_LIBCPP_HIDE_FROM_ABI -#endif -inline _LIBCPP_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( - const char* __s, __locale_t __loc, const char* __format, ...) 
{ - va_list __va; - va_start(__va, __format); - __locale_guard __current(__loc); - int __res = std::vsscanf(__s, __format, __va); - va_end(__va); - return __res; -} - } // namespace __locale _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__locale_dir/support/no_locale/strtonum.h b/libcxx/include/__locale_dir/support/no_locale/strtonum.h index 0e7a32993e736..59544e10e4a4c 100644 --- a/libcxx/include/__locale_dir/support/no_locale/strtonum.h +++ b/libcxx/include/__locale_dir/support/no_locale/strtonum.h @@ -34,15 +34,6 @@ inline _LIBCPP_HIDE_FROM_ABI long double __strtold(const char* __nptr, char** __ return std::strtold(__nptr, __endptr); } -inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t) { - return std::strtoll(__nptr, __endptr, __base); -} - -inline _LIBCPP_HIDE_FROM_ABI unsigned long long -__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t) { - return std::strtoull(__nptr, __endptr, __base); -} - } // namespace __locale _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__locale_dir/support/windows.h b/libcxx/include/__locale_dir/support/windows.h index edd8a66c23e80..644ef68adf545 100644 --- a/libcxx/include/__locale_dir/support/windows.h +++ b/libcxx/include/__locale_dir/support/windows.h @@ -186,14 +186,6 @@ inline _LIBCPP_HIDE_FROM_ABI double __strtod(const char* __nptr, char** __endptr return ::_strtod_l(__nptr, __endptr, __loc); } -inline _LIBCPP_HIDE_FROM_ABI long long __strtoll(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return ::_strtoi64_l(__nptr, __endptr, __base, __loc); -} -inline _LIBCPP_HIDE_FROM_ABI unsigned long long -__strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { - return ::_strtoui64_l(__nptr, __endptr, __base, __loc); -} - // // Character manipulation functions // @@ -276,23 +268,6 @@ _LIBCPP_EXPORTED_FROM_ABI _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snpri _LIBCPP_EXPORTED_FROM_ABI 
_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __asprintf(char** __ret, __locale_t __loc, const char* __format, ...); -_LIBCPP_DIAGNOSTIC_PUSH -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat") -_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates -#ifdef _LIBCPP_COMPILER_CLANG_BASED -# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) -#else -# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) /* nothing */ -#endif - -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __sscanf( - const char* __dest, __locale_t __loc, const char* __format, _Args&&... __args) { - return ::_sscanf_l(__dest, __format, __loc, std::forward<_Args>(__args)...); -} -_LIBCPP_DIAGNOSTIC_POP -#undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT - #if defined(_LIBCPP_BUILDING_LIBRARY) struct __locale_guard { _LIBCPP_HIDE_FROM_ABI __locale_guard(__locale_t __l) : __status(_configthreadlocale(_ENABLE_PER_THREAD_LOCALE)) { diff --git a/libcxx/include/__support/xlocale/__strtonum_fallback.h b/libcxx/include/__support/xlocale/__strtonum_fallback.h index 5275aead35af9..90bd59d36c248 100644 --- a/libcxx/include/__support/xlocale/__strtonum_fallback.h +++ b/libcxx/include/__support/xlocale/__strtonum_fallback.h @@ -34,12 +34,4 @@ inline _LIBCPP_HIDE_FROM_ABI long double strtold_l(const char* __nptr, char** __ return ::strtold(__nptr, __endptr); } -inline _LIBCPP_HIDE_FROM_ABI long long strtoll_l(const char* __nptr, char** __endptr, int __base, locale_t) { - return ::strtoll(__nptr, __endptr, __base); -} - -inline _LIBCPP_HIDE_FROM_ABI unsigned long long strtoull_l(const char* __nptr, char** __endptr, int __base, locale_t) { - return ::strtoull(__nptr, __endptr, __base); -} - #endif // _LIBCPP___SUPPORT_XLOCALE_STRTONUM_FALLBACK_H diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 57d66cd1ccaef..492708792cbbf 100644 --- 
a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1591,7 +1591,6 @@ module std [system] { module locale_base_api { textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" textual header "__locale_dir/locale_base_api/ibm.h" - textual header "__locale_dir/locale_base_api/musl.h" textual header "__locale_dir/locale_base_api/openbsd.h" } export * diff --git a/libcxx/include/string_view b/libcxx/include/string_view index 5ecaa3de7deba..5dd04a9ba8479 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -362,11 +362,11 @@ public: # endif // [string.view.iterators], iterators - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return cbegin(); } + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return cbegin(); } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return cend(); } + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return cend(); } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data(), data(), data() + size()); # else @@ -374,7 +374,7 @@ public: # endif } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { # ifdef _LIBCPP_ABI_BOUNDED_ITERATORS return std::__make_bounded_iter(data() + size(), data(), data() + size()); # else @@ -382,51 +382,54 @@ public: # endif } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + rbegin() const _NOEXCEPT { return 
const_reverse_iterator(cend()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(cbegin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + crbegin() const _NOEXCEPT { return const_reverse_iterator(cend()); } - _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX17 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return const_reverse_iterator(cbegin()); } // [string.view.capacity], capacity - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type length() const _NOEXCEPT { return __size_; } + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type length() const _NOEXCEPT { return __size_; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return numeric_limits::max() / sizeof(value_type); } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return __size_ == 0; } // [string.view.access], element access - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference operator[](size_type __pos) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference + operator[](size_type __pos) const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos < size(), "string_view[] index out of 
bounds"), __data_[__pos]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __pos) const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference at(size_type __pos) const { return __pos >= size() ? (__throw_out_of_range("string_view::at"), __data_[0]) : __data_[__pos]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference front() const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string_view::front(): string is empty"), __data_[0]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_reference back() const _NOEXCEPT { return _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "string_view::back(): string is empty"), __data_[__size_ - 1]; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_pointer data() const _NOEXCEPT { return __data_; } + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI const_pointer data() const _NOEXCEPT { return __data_; } // [string.view.modifiers], modifiers: _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI void remove_prefix(size_type __n) _NOEXCEPT { @@ -459,7 +462,8 @@ public: return __rlen; } - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view substr(size_type __pos = 0, size_type __n = npos) const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view + substr(size_type __pos = 0, size_type __n = npos) const { // Use the `__assume_valid` form of the constructor to avoid an unnecessary check. Any substring of a view is a // valid view. In particular, `size()` is known to be smaller than `numeric_limits::max()`, so the // new size is also smaller. See also https://llvm.org/PR91634. 
@@ -474,7 +478,7 @@ public: } # endif - _LIBCPP_CONSTEXPR_SINCE_CXX14 int compare(basic_string_view __sv) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 int compare(basic_string_view __sv) const _NOEXCEPT { size_type __rlen = std::min(size(), __sv.size()); int __retval = _Traits::compare(data(), __sv.data(), __rlen); if (__retval == 0) // first __rlen chars matched @@ -482,50 +486,51 @@ public: return __retval; } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, basic_string_view __sv) const { return substr(__pos1, __n1).compare(__sv); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, basic_string_view __sv, size_type __pos2, size_type __n2) const { return substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { return compare(basic_string_view(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s) const { return substr(__pos1, __n1).compare(basic_string_view(__s)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(size_type __pos1, size_type __n1, const _CharT* __s, size_type __n2) const _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero") { return substr(__pos1, __n1).compare(basic_string_view(__s, __n2)); } // find - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] 
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { return std::__str_find(data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find(_CharT __c, size_type __pos = 0) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + find(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find(): received nullptr"); return std::__str_find(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find(): received nullptr"); return std::__str_find( @@ -533,24 +538,24 @@ public: } // rfind - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type rfind(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind(data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type rfind(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_rfind(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] 
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type rfind(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::rfind(): received nullptr"); return std::__str_rfind(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type rfind(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::rfind(): received nullptr"); return std::__str_rfind( @@ -558,25 +563,25 @@ public: } // find_first_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return find(__c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type 
__pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of( @@ -584,25 +589,25 @@ public: } // find_last_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return rfind(__c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of( @@ -610,25 +615,25 @@ public: } // find_first_not_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(basic_string_view __s, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 
_LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(_CharT __c, size_type __pos = 0) const _NOEXCEPT { return std::__str_find_first_not_of(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_first_not_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of( @@ -636,25 +641,25 @@ public: } // find_last_not_of - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(basic_string_view __s, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of( data(), size(), __s.data(), __pos, __s.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(_CharT __c, size_type __pos = npos) const _NOEXCEPT { return std::__str_find_last_not_of(data(), size(), __c, __pos); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT 
_LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of(data(), size(), __s, __pos, __n); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type find_last_not_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of( @@ -662,37 +667,43 @@ public: } # if _LIBCPP_STD_VER >= 20 - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(basic_string_view __s) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(basic_string_view __s) const noexcept { return size() >= __s.size() && compare(0, __s.size(), __s) == 0; } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(value_type __c) const noexcept { return !empty() && _Traits::eq(front(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool + starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return starts_with(basic_string_view(__s)); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(basic_string_view __s) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(basic_string_view __s) const noexcept { return size() >= __s.size() && compare(size() - __s.size(), npos, __s) == 0; } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(value_type __c) const noexcept { return !empty() && _Traits::eq(back(), __c); } - constexpr 
_LIBCPP_HIDE_FROM_ABI bool ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool + ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return ends_with(basic_string_view(__s)); } # endif # if _LIBCPP_STD_VER >= 23 - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(basic_string_view __sv) const noexcept { return find(__sv) != npos; } + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(basic_string_view __sv) const noexcept { + return find(__sv) != npos; + } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { return find(__c) != npos; } + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { + return find(__c) != npos; + } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { + [[nodiscard]] constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { return find(__s) != npos; } # endif @@ -897,7 +908,8 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Trai // [string.view.hash] template struct __string_view_hash : public __unary_function >, size_t> { - _LIBCPP_HIDE_FROM_ABI size_t operator()(const basic_string_view<_CharT, char_traits<_CharT> > __val) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t + operator()(const basic_string_view<_CharT, char_traits<_CharT> > __val) const _NOEXCEPT { return std::__do_string_hash(__val.data(), __val.data() + __val.size()); } }; @@ -924,30 +936,31 @@ struct hash > > : __string_view_ # if _LIBCPP_STD_VER >= 14 inline namespace literals { inline namespace string_view_literals { -inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view operator""sv(const char* __str, size_t __len) noexcept { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view +operator""sv(const char* __str, size_t __len) noexcept { 
return basic_string_view(__str, __len); } # if _LIBCPP_HAS_WIDE_CHARACTERS -inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view operator""sv(const wchar_t* __str, size_t __len) noexcept { return basic_string_view(__str, __len); } # endif # if _LIBCPP_HAS_CHAR8_T -inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view +[[nodiscard]] inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view operator""sv(const char8_t* __str, size_t __len) noexcept { return basic_string_view(__str, __len); } # endif -inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view operator""sv(const char16_t* __str, size_t __len) noexcept { return basic_string_view(__str, __len); } -inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string_view operator""sv(const char32_t* __str, size_t __len) noexcept { return basic_string_view(__str, __len); } diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp index 0f695d4f1a229..2081e75fdf64b 100644 --- a/libcxx/src/locale.cpp +++ b/libcxx/src/locale.cpp @@ -5557,6 +5557,54 @@ string __num_get<_CharT>::__stage2_int_prep(ios_base& __iob, _CharT* __atoms, _C return __np.grouping(); } +template +int __num_get<_CharT>::__stage2_int_loop( + _CharT __ct, + int __base, + char* __a, + char*& __a_end, + unsigned& __dc, + _CharT __thousands_sep, + const string& __grouping, + unsigned* __g, + unsigned*& __g_end, + _CharT* __atoms) { + if (__a_end == __a && (__ct == __atoms[24] || __ct == __atoms[25])) { + *__a_end++ = __ct == __atoms[24] ? 
'+' : '-'; + __dc = 0; + return 0; + } + if (__grouping.size() != 0 && __ct == __thousands_sep) { + if (__g_end - __g < __num_get_buf_sz) { + *__g_end++ = __dc; + __dc = 0; + } + return 0; + } + ptrdiff_t __f = __atoms_offset(__atoms, __ct); + if (__f >= 24) + return -1; + switch (__base) { + case 8: + case 10: + if (__f >= __base) + return -1; + break; + case 16: + if (__f < 22) + break; + if (__a_end != __a && __a_end - __a <= 2 && __a_end[-1] == '0') { + __dc = 0; + *__a_end++ = __src[__f]; + return 0; + } + return -1; + } + *__a_end++ = __src[__f]; + ++__dc; + return 0; +} + template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate; _LIBCPP_IF_WIDE_CHARACTERS(template class _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS collate;) diff --git a/libcxx/test/libcxx/diagnostics/string_view.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/string_view.nodiscard.verify.cpp index e5b2258315fe4..89e4a5b44ab48 100644 --- a/libcxx/test/libcxx/diagnostics/string_view.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/string_view.nodiscard.verify.cpp @@ -12,12 +12,140 @@ #include +#include "type_algorithms.h" #include "test_macros.h" -void test() { - std::string_view string_view; - string_view.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +void test_members() { + std::string_view sv; + + sv.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' 
attribute}} + sv.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.length(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv[0]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.at(0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.data(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.substr(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} #if TEST_STD_VER >= 26 - string_view.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.compare(sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.compare(0, 0, sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.compare(0, 0, sv, 0, 0); + // expected-warning@+1 {{ignoring return value of 
function declared with 'nodiscard' attribute}} + sv.compare(""); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.compare(0, 0, ""); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.compare(0, 0, "", 0); + + sv.find(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find("", 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.rfind(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.rfind(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.rfind("", 0, 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.rfind("", 0); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_of(sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_of(' '); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_of("", 0, 0); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_of("", 0); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_of(sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_of(' '); + // expected-warning@+1 
{{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_of("", 0, 0); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_of("", 0); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_not_of(sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_not_of(' '); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_not_of("", 0, 0); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_first_not_of("", 0); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_not_of(sv); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_not_of(' '); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_not_of("", 0, 0); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.find_last_not_of("", 0); + +#if TEST_STD_VER >= 20 + sv.starts_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.starts_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.starts_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + sv.ends_with(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.ends_with(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.ends_with(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif + +#if 
TEST_STD_VER >= 23 + sv.contains(sv); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.contains(' '); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + sv.contains(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#endif +} + +void test_nonmembers() { + // std::hash<> + + std::hash hash; + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + hash(std::string_view{}); + +#if TEST_STD_VER >= 14 + // string_view literals + + using namespace std::string_view_literals; + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + ""sv; // const char* +# if !defined(TEST_HAS_NO_WIDE_CHARACTERS) + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + L""sv; // const wchar_t* +# endif +# if !defined(TEST_HAS_NO_CHAR8_T) + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + u8""sv; // const char8_t* +# endif + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + u""sv; // const char16_t* + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + U""sv; // const char32_t* #endif } diff --git a/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp b/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp index 316c9828e0de5..ffe048730e687 100644 --- a/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp +++ b/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp @@ -10,8 +10,10 @@ // Ensure that APIs which take a CharT* are diagnosing passing a nullptr to them -// Clang 19 and AppleClang don't have diagnose_if with diagnostic flags -// UNSUPPORTED: clang-19, apple-clang-17 +// AppleClang doesn't have diagnose_if with diagnostic flags +// UNSUPPORTED: 
apple-clang-17 + +// ADDITIONAL_COMPILE_FLAGS: -Wno-unused-result #include diff --git a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp index 8df89d0ba9206..900ca0e5e1c5e 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ccomplex.verify.cpp @@ -14,6 +14,11 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. +// XFAIL: * + #include // expected-warning@ccomplex:* {{ is deprecated in C++17 and removed in C++20. Include instead.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp index 32b57033331c8..a1ca842bc62ab 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ciso646.verify.cpp @@ -14,6 +14,11 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: clang-modules-build +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. +// XFAIL: * + #include // expected-warning@ciso646:* {{ is removed in C++20. 
Include instead.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp index 23a7709a9d658..503a87658ac02 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/cstdalign.verify.cpp @@ -14,6 +14,11 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. +// XFAIL: * + #include // expected-warning@cstdalign:* {{ is deprecated in C++17 and removed in C++20.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp index c2c0f03c52d3c..80025c5ab72d2 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/cstdbool.verify.cpp @@ -14,6 +14,11 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. 
+// XFAIL: * + #include // expected-warning@cstdbool:* {{ is deprecated in C++17 and removed in C++20.}} diff --git a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp index 4f5564915443d..07bdd29648a68 100644 --- a/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp +++ b/libcxx/test/std/depr/depr.cpp.headers/ctgmath.verify.cpp @@ -14,6 +14,11 @@ // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: clang-modules-build +// FIXME: using `#warning` causes diagnostics from system headers which include deprecated headers. This can only be +// enabled again once https://github.com/llvm/llvm-project/pull/168041 (or a similar feature) has landed, since that +// allows suppression in system headers. +// XFAIL: * + #include // expected-warning@ctgmath:* {{ is deprecated in C++17 and removed in C++20. Include and instead.}} diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp index 015408294bc8c..a110aae2db11b 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp @@ -13,6 +13,8 @@ // iter_type get(iter_type in, iter_type end, ios_base&, // ios_base::iostate& err, long& v) const; +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + #include #include #include @@ -98,6 +100,18 @@ int main(int, char**) assert(err == ios.goodbit); assert(v == 291); } + { + const char str[] = "a123"; + std::dec(ios); + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), + cpp17_input_iterator(str+sizeof(str)), + ios, err, v); + assert(base(iter) == str); + assert(err == ios.failbit); + 
assert(v == 0); + } { const char str[] = "0x123"; std::hex(ios); @@ -519,6 +533,142 @@ int main(int, char**) assert(err == ios.failbit); assert(v == std::numeric_limits::max()); } + { + v = -1; + const char str[] = ""; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), cpp17_input_iterator(str), ios, err, v); + assert(base(iter) == str); + assert(err == (std::ios::eofbit | std::ios::failbit)); + assert(v == 0); + } + { + v = -1; + const char str[] = "+"; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), cpp17_input_iterator(str + 1), ios, err, v); + assert(base(iter) == str + 1); + assert(err == (std::ios::eofbit | std::ios::failbit)); + assert(v == 0); + } + { + v = -1; + const char str[] = "+"; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(std::begin(str)), + cpp17_input_iterator(std::end(str)), + ios, + err, + v); + assert(base(iter) == str + 1); + assert(err == ios.failbit); + assert(v == 0); + } + { + v = -1; + const char str[] = "-"; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(std::begin(str)), + cpp17_input_iterator(std::end(str)), + ios, + err, + v); + assert(base(iter) == str + 1); + assert(err == ios.failbit); + assert(v == 0); + } + { + v = -1; + const char str[] = "0"; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(std::begin(str)), + cpp17_input_iterator(std::end(str)), + ios, + err, + v); + assert(base(iter) == str + 1); + assert(err == ios.goodbit); + assert(v == 0); + } + { + v = -1; + const char str[] = "078"; + std::ios_base::iostate err = ios.goodbit; + + ios.flags(ios.flags() & ~ios.basefield); + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(std::begin(str)), + cpp17_input_iterator(std::end(str)), + ios, + err, + v); + assert(base(iter) 
== str + 2); + assert(err == ios.goodbit); + assert(v == 7); + ios.flags(ios.flags() | ios.dec); + } + { + v = -1; + std::string str = std::to_string(std::numeric_limits::max()) + "99a"; + std::ios_base::iostate err = ios.goodbit; + + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(str.data()), + cpp17_input_iterator(str.data() + str.size()), + ios, + err, + v); + assert(base(iter) == str.data() + str.size() - 1); + assert(err == ios.failbit); + assert(v == std::numeric_limits::max()); + } + { + std::string str = std::to_string(std::numeric_limits::max()) + 'c'; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str.data()), + cpp17_input_iterator(str.data() + str.size()), + ios, err, v); + assert(base(iter) == str.data() + str.size() - 1); + assert(err == ios.goodbit); + assert(v == std::numeric_limits::max()); + } + { + std::string str = std::to_string(static_cast(std::numeric_limits::max()) + 1) + 'c'; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(str.data()), + cpp17_input_iterator(str.data() + str.size()), + ios, + err, + v); + assert(base(iter) == str.data() + str.size() - 1); + assert(err == ios.failbit); + assert(v == std::numeric_limits::max()); + } + { + std::string str = '-' + std::to_string(static_cast(std::numeric_limits::max()) + 2) + 'c'; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = f.get( + cpp17_input_iterator(str.data()), + cpp17_input_iterator(str.data() + str.size()), + ios, + err, + v); + assert(base(iter) == str.data() + str.size() - 1); + assert(err == ios.failbit); + assert(v == std::numeric_limits::min()); + } return 0; } diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_int.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_int.pass.cpp index 
bee1be08baafc..f9cef08e247d0 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_int.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_int.pass.cpp @@ -68,6 +68,17 @@ int main(int, char**) assert(err == ios.goodbit); assert(v == 1); } + { + const char str[] = "-1"; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), + cpp17_input_iterator(str+sizeof(str)), + ios, err, v); + assert(base(iter) == str+sizeof(str)-1); + assert(err == ios.goodbit); + assert(v == std::numeric_limits::max()); + } std::hex(ios); { const char str[] = "0xFFFFFFFF"; diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long.pass.cpp index b087bdcd94017..fed6fc0246d82 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long.pass.cpp @@ -68,6 +68,17 @@ int main(int, char**) assert(err == ios.goodbit); assert(v == 1); } + { + const char str[] = "-1"; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), + cpp17_input_iterator(str+sizeof(str)), + ios, err, v); + assert(base(iter) == str+sizeof(str)-1); + assert(err == ios.goodbit); + assert(v == std::numeric_limits::max()); + } std::hex(ios); { const char str[] = "0xFFFFFFFF"; diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long_long.pass.cpp 
b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long_long.pass.cpp index 6769aebe424e3..0bdb6c1c38606 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long_long.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_long_long.pass.cpp @@ -68,6 +68,17 @@ int main(int, char**) assert(err == ios.goodbit); assert(v == 1); } + { + const char str[] = "-1"; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), + cpp17_input_iterator(str+sizeof(str)), + ios, err, v); + assert(base(iter) == str+sizeof(str)-1); + assert(err == ios.goodbit); + assert(v == std::numeric_limits::max()); + } std::hex(ios); { const char str[] = "0xFFFFFFFFFFFFFFFF"; diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_short.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_short.pass.cpp index bec9a7ff9e3bb..decfbe943461f 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_short.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_unsigned_short.pass.cpp @@ -68,6 +68,17 @@ int main(int, char**) assert(err == ios.goodbit); assert(v == 1); } + { + const char str[] = "-1"; + std::ios_base::iostate err = ios.goodbit; + cpp17_input_iterator iter = + f.get(cpp17_input_iterator(str), + cpp17_input_iterator(str+sizeof(str)), + ios, err, v); + assert(base(iter) == str+sizeof(str)-1); + assert(err == ios.goodbit); + assert(v == std::numeric_limits::max()); + } std::hex(ios); { const char str[] = "0xFFFF"; diff --git 
a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index 8b77a06323e3d..2ac69c38ebffa 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -33,64 +33,63 @@ definitions: - "**/CMakeOutput.log" steps: -# Linaro's ARM builders are temporarily offline. -#- group: ARM -# steps: -# - label: AArch64 -# command: libcxx/utils/ci/run-buildbot aarch64 -# agents: -# queue: libcxx-builders-linaro-arm -# arch: aarch64 -# <<: *common -# -# - label: AArch64 -fno-exceptions -# command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions -# agents: -# queue: libcxx-builders-linaro-arm -# arch: aarch64 -# <<: *common -# -# - label: Armv8 -# command: libcxx/utils/ci/run-buildbot armv8 -# agents: -# queue: libcxx-builders-linaro-arm -# arch: armv8l -# <<: *common -# -# - label: Armv8 -fno-exceptions -# command: libcxx/utils/ci/run-buildbot armv8-no-exceptions -# agents: -# queue: libcxx-builders-linaro-arm -# arch: armv8l -# <<: *common -# -# - label: Armv7 -# command: libcxx/utils/ci/run-buildbot armv7 -# agents: -# queue: libcxx-builders-linaro-arm -# arch: armv8l -# <<: *common -# -# - label: Armv7 -fno-exceptions -# command: libcxx/utils/ci/run-buildbot armv7-no-exceptions -# agents: -# queue: libcxx-builders-linaro-arm -# arch: armv8l -# <<: *common -# -# - label: Armv7-M picolibc -# command: libcxx/utils/ci/run-buildbot armv7m-picolibc -# agents: -# queue: libcxx-builders-linaro-arm -# arch: aarch64 -# <<: *common -# -# - label: Armv7-M picolibc -fno-exceptions -# command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions -# agents: -# queue: libcxx-builders-linaro-arm -# arch: aarch64 -# <<: *common +- group: ARM + steps: + - label: AArch64 + command: libcxx/utils/ci/run-buildbot aarch64 + agents: + queue: libcxx-builders-linaro-arm + arch: aarch64 + <<: *common + + - label: AArch64 -fno-exceptions + command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions + agents: + queue: 
libcxx-builders-linaro-arm + arch: aarch64 + <<: *common + + - label: Armv8 + command: libcxx/utils/ci/run-buildbot armv8 + agents: + queue: libcxx-builders-linaro-arm + arch: armv8l + <<: *common + + - label: Armv8 -fno-exceptions + command: libcxx/utils/ci/run-buildbot armv8-no-exceptions + agents: + queue: libcxx-builders-linaro-arm + arch: armv8l + <<: *common + + - label: Armv7 + command: libcxx/utils/ci/run-buildbot armv7 + agents: + queue: libcxx-builders-linaro-arm + arch: armv8l + <<: *common + + - label: Armv7 -fno-exceptions + command: libcxx/utils/ci/run-buildbot armv7-no-exceptions + agents: + queue: libcxx-builders-linaro-arm + arch: armv8l + <<: *common + + - label: Armv7-M picolibc + command: libcxx/utils/ci/run-buildbot armv7m-picolibc + agents: + queue: libcxx-builders-linaro-arm + arch: aarch64 + <<: *common + + - label: Armv7-M picolibc -fno-exceptions + command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions + agents: + queue: libcxx-builders-linaro-arm + arch: aarch64 + <<: *common - group: AIX steps: diff --git a/libsycl/Maintainers.md b/libsycl/Maintainers.md new file mode 100644 index 0000000000000..4ffc9e87d3bdd --- /dev/null +++ b/libsycl/Maintainers.md @@ -0,0 +1,13 @@ +# libsycl Maintainers + +This file is a list of the +[maintainers](https://llvm.org/docs/DeveloperPolicy.html#maintainers) for +the SYCL Runtime library. 
+ +# Current Maintainers + +Alexey Bader \ +| alexey.bader@intel.com (email), bader (GitHub, Discord, Discourse) + +Kseniya Tikhomirova \ +| kseniya.tikhomirova@intel.com (email), KseniyaTikhomirova (GitHub, Discourse) \ No newline at end of file diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 987586b97dfdc..b6b073a96bcad 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -63,7 +63,7 @@ static void DumpUnwindRowsToLog(Log *log, AddressRange range, } static void DumpInstToLog(Log *log, Instruction &inst, - InstructionList inst_list) { + const InstructionList &inst_list) { if (!log || !log->GetVerbose()) return; const bool show_address = true; diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 0b77e0a4929a7..48107717abd31 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -477,18 +477,17 @@ bool AddPythonDLLToSearchPath() { #endif #ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME -/// Returns whether `python3x.dll` is in the DLL search path. +/// Returns true if `python3x.dll` can be loaded. bool IsPythonDLLInPath() { #define WIDEN2(x) L##x #define WIDEN(x) WIDEN2(x) - WCHAR foundPath[MAX_PATH]; - DWORD result = - SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr, - MAX_PATH, foundPath, nullptr); + HMODULE h = LoadLibraryW(WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME)); + if (!h) + return false; + FreeLibrary(h); + return true; #undef WIDEN2 #undef WIDEN - - return result > 0; } #endif diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index b0d4e46bb508f..2f897332f40c9 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -511,6 +511,8 @@ Some subprojects maintain their own list of per-component maintainers. 
[libclc maintainers](https://github.com/llvm/llvm-project/blob/main/libclc/Maintainers.md) +[libsycl maintainers](https://github.com/llvm/llvm-project/blob/main/libsycl/Maintainers.md) + [LLD maintainers](https://github.com/llvm/llvm-project/blob/main/lld/Maintainers.md) [LLDB maintainers](https://github.com/llvm/llvm-project/blob/main/lldb/Maintainers.md) diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst index 6aec05f29329a..ee12d983dd99e 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst @@ -453,7 +453,7 @@ starts with: return LHS; This code gets the precedence of the current token and checks to see if -if is too low. Because we defined invalid tokens to have a precedence of +it is too low. Because we defined invalid tokens to have a precedence of -1, this check implicitly knows that the pair-stream ends when the token stream runs out of binary operators. If this check succeeds, we know that the token is a binary operator and that it will be included in this diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h index 5b9535380aebf..d6f5d926f022c 100644 --- a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h @@ -708,7 +708,11 @@ class LLVM_ABI DWARFLinker : public DWARFLinkerBase { /// already there. /// \returns is a name was found. 
bool getDIENames(const DWARFDie &Die, AttributesInfo &Info, - OffsetsStringPool &StringPool, bool StripTemplate = false); + OffsetsStringPool &StringPool, const DWARFFile &File, + CompileUnit &Unit, bool StripTemplate = false); + + llvm::StringRef getCanonicalDIEName(DWARFDie Die, const DWARFFile &File, + CompileUnit *Unit); uint32_t hashFullyQualifiedName(DWARFDie DIE, CompileUnit &U, const DWARFFile &File, diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h index 9fb1b3f80e2ff..5ced6d05cc231 100644 --- a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h +++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinkerDeclContext.h @@ -84,11 +84,13 @@ class DeclContext { DeclContext() : DefinedInClangModule(0), Parent(*this) {} DeclContext(unsigned Hash, uint32_t Line, uint32_t ByteSize, uint16_t Tag, - StringRef Name, StringRef File, const DeclContext &Parent, - DWARFDie LastSeenDIE = DWARFDie(), unsigned CUId = 0) + StringRef Name, StringRef NameForUniquing, StringRef File, + const DeclContext &Parent, DWARFDie LastSeenDIE = DWARFDie(), + unsigned CUId = 0) : QualifiedNameHash(Hash), Line(Line), ByteSize(ByteSize), Tag(Tag), - DefinedInClangModule(0), Name(Name), File(File), Parent(Parent), - LastSeenDIE(LastSeenDIE), LastSeenCompileUnitID(CUId) {} + DefinedInClangModule(0), Name(Name), NameForUniquing(NameForUniquing), + File(File), Parent(Parent), LastSeenDIE(LastSeenDIE), + LastSeenCompileUnitID(CUId) {} uint32_t getQualifiedNameHash() const { return QualifiedNameHash; } @@ -100,6 +102,7 @@ class DeclContext { uint32_t getCanonicalDIEOffset() const { return CanonicalDIEOffset; } void setCanonicalDIEOffset(uint32_t Offset) { CanonicalDIEOffset = Offset; } + llvm::StringRef getCanonicalName() const { return Name; } bool isDefinedInClangModule() const { return DefinedInClangModule; } void setDefinedInClangModule(bool Val) { DefinedInClangModule = Val; } @@ -115,6 +118,7 
@@ class DeclContext { uint16_t Tag = dwarf::DW_TAG_compile_unit; unsigned DefinedInClangModule : 1; StringRef Name; + StringRef NameForUniquing; StringRef File; const DeclContext &Parent; DWARFDie LastSeenDIE; @@ -180,7 +184,7 @@ struct DeclMapInfo : private DenseMapInfo { return RHS == LHS; return LHS->QualifiedNameHash == RHS->QualifiedNameHash && LHS->Line == RHS->Line && LHS->ByteSize == RHS->ByteSize && - LHS->Name.data() == RHS->Name.data() && + LHS->NameForUniquing.data() == RHS->NameForUniquing.data() && LHS->File.data() == RHS->File.data() && LHS->Parent.QualifiedNameHash == RHS->Parent.QualifiedNameHash; } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index e624088a0964e..ce933c5f1be85 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -307,7 +307,13 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { /// number if it is not zero. If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. - void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); + /// + /// If \p SubregToRegSrcInsts is not empty, we are coalescing a + /// `DstReg = SUBREG_TO_REG SrcReg`, which should introduce an + /// implicit-def of DstReg on instructions that define SrcReg. + void updateRegDefsUses( + Register SrcReg, Register DstReg, unsigned SubIdx, + SmallPtrSetImpl *SubregToRegSrcInsts = nullptr); /// If the given machine operand reads only undefined lanes add an undef /// flag. @@ -1444,6 +1450,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // CopyMI may have implicit operands, save them so that we can transfer them // over to the newly materialized instruction after CopyMI is removed. 
+ LaneBitmask NewMIImplicitOpsMask; SmallVector ImplicitOps; ImplicitOps.reserve(CopyMI->getNumOperands() - CopyMI->getDesc().getNumOperands()); @@ -1458,6 +1465,9 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, (MO.getSubReg() == 0 && MO.getReg() == DstOperand.getReg())) && "unexpected implicit virtual register def"); ImplicitOps.push_back(MO); + if (MO.isDef() && MO.getReg().isVirtual() && + MRI->shouldTrackSubRegLiveness(DstReg)) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } @@ -1494,14 +1504,11 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, } else { assert(MO.getReg() == NewMI.getOperand(0).getReg()); - // We're only expecting another def of the main output, so the range - // should get updated with the regular output range. - // - // FIXME: The range updating below probably needs updating to look at - // the super register if subranges are tracked. - assert(!MRI->shouldTrackSubRegLiveness(DstReg) && - "subrange update for implicit-def of super register may not be " - "properly handled"); + // If lanemasks need to be tracked, compile the lanemask of the NewMI + // implicit def operands to avoid subranges for the super-regs from + // being removed by code later on in this function. 
+ if (MRI->shouldTrackSubRegLiveness(MO.getReg())) + NewMIImplicitOpsMask |= MRI->getMaxLaneMaskForVReg(MO.getReg()); } } } @@ -1617,7 +1624,8 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, *LIS->getSlotIndexes(), *TRI); for (LiveInterval::SubRange &SR : DstInt.subranges()) { - if ((SR.LaneMask & DstMask).none()) { + if ((SR.LaneMask & DstMask).none() && + (SR.LaneMask & NewMIImplicitOpsMask).none()) { LLVM_DEBUG(dbgs() << "Removing undefined SubRange " << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); @@ -1891,11 +1899,14 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } } -void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx) { +void RegisterCoalescer::updateRegDefsUses( + Register SrcReg, Register DstReg, unsigned SubIdx, + SmallPtrSetImpl *SubregToRegSrcInsts) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); + // Coalescing a COPY may expose reads of 'undef' subregisters. + // If so, then explicitly propagate 'undef' to those operands. if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) { for (MachineOperand &MO : MRI->reg_operands(DstReg)) { if (MO.isUndef()) @@ -1912,6 +1923,15 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, } } + // If DstInt already has a subrange for the unused lanes, then we shouldn't + // create duplicate subranges when we update the interval for unused lanes. + LaneBitmask DstIntLaneMask; + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + for (LiveInterval::SubRange &SR : DstInt->subranges()) + DstIntLaneMask |= SR.LaneMask; + } + + // Go through all instructions to replace uses of 'SrcReg' by 'DstReg'. 
SmallPtrSet Visited; for (MachineRegisterInfo::reg_instr_iterator I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end(); @@ -1935,6 +1955,82 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); + bool RequiresImplicitRedef = false; + if (SubregToRegSrcInsts && !SubregToRegSrcInsts->empty()) { + // We can only add an implicit-def and undef if the sub registers match, + // e.g. + // %0:gr32 = INSTX + // %0.sub8:gr32 = INSTY // top 24 bits of %0 still defined + // %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub32 + // + // This cannot be transformed into: + // %1.sub32:gr64 = INSTX + // undef %1.sub8:gr64 = INSTY , implicit-def %1 + // + // because the undef means that none of the bits of %1 are read, thus + // thrashing the top 24 bits of %1.sub32. + if (SubregToRegSrcInsts->contains(UseMI) && + all_of(UseMI->all_defs(), + [&SubIdx, &SrcReg](const MachineOperand &MO) -> bool { + if (MO.getReg() != SrcReg) // Ignore unrelated registers + return true; + return MO.isUndef() || + (SubIdx && + (!MO.getSubReg() || SubIdx == MO.getSubReg())); + })) { + // Add implicit-def of super-register to express that the whole + // register is defined by the instruction. + UseMI->addRegisterDefined(DstReg); + RequiresImplicitRedef = true; + } + + // If the coalesed instruction doesn't fully define the register, we need + // to preserve the original super register liveness for SUBREG_TO_REG. + // + // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes, + // but it introduces liveness for other subregisters. Downstream users may + // have been relying on those bits, so we need to ensure their liveness is + // captured with a def of other lanes. + if (DstInt && MRI->shouldTrackSubRegLiveness(DstReg)) { + // First check if there is sufficient granularity in terms of subranges. 
+ LaneBitmask DstMask = MRI->getMaxLaneMaskForVReg(DstInt->reg()); + LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx); + LaneBitmask UnusedLanes = DstMask & ~UsedLanes; + if ((UnusedLanes & ~DstIntLaneMask).any()) { + BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); + DstInt->createSubRangeFrom(Allocator, UnusedLanes, *DstInt); + DstIntLaneMask |= UnusedLanes; + } + + // After duplicating the live ranges for the low/hi bits, we + // need to update the subranges of the DstReg interval such that + // for a case like this: + // + // entry: + // 16B %1:gpr32 = INSTRUCTION (<=> UseMI) + // : + // if.then: + // 32B %1:gpr32 = MOVIMM32 .. + // 48B %0:gpr64 = SUBREG_TO_REG 0, %1, sub32 + // + // Only the MOVIMM32 requires a def of the top lanes and any intervals + // for the top 32-bits of the def at 16B should be removed. + for (LiveInterval::SubRange &SR : DstInt->subranges()) { + if (!Writes || RequiresImplicitRedef || + (SR.LaneMask & UnusedLanes).none()) + continue; + + assert((SR.LaneMask & UnusedLanes) == SR.LaneMask && + "Unexpected lanemask. Subrange needs finer granularity"); + + SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(); + auto SegmentI = SR.find(UseIdx); + if (SegmentI != SR.end()) + SR.removeSegment(SegmentI, true); + } + } + } + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned Op : Ops) { MachineOperand &MO = UseMI->getOperand(Op); @@ -1943,7 +2039,7 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // turn a full def into a read-modify-write sub-register def and vice // versa. if (SubIdx && MO.isDef()) - MO.setIsUndef(!Reads); + MO.setIsUndef(!Reads || RequiresImplicitRedef); // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. 
@@ -2046,6 +2142,38 @@ void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI, LIS->shrinkToUses(&LI); } +/// For a given use of value \p Idx, it returns the def in the current block, +/// or otherwise all possible defs in preceding blocks. +static bool findPrecedingDefs(SmallPtrSetImpl &Instrs, + LiveIntervals *LIS, LiveInterval &SrcInt, + MachineBasicBlock *MBB, VNInfo *Idx) { + auto IsPrecedingDef = [&](VNInfo *Idx) -> bool { + if (Idx->isPHIDef()) + return false; + MachineInstr *Def = LIS->getInstructionFromIndex(Idx->def); + assert(Def && "Unable to find a def for SUBREG_TO_REG source operand"); + Instrs.insert(Def); + return true; + }; + + if (IsPrecedingDef(Idx)) + return true; + + SmallVector Worklist(MBB->pred_begin(), MBB->pred_end()); + SmallPtrSet VisitedBlocks; + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + auto [_, Inserted] = VisitedBlocks.insert(MBB); + if (!Inserted) + continue; + VNInfo *Idx = SrcInt.getVNInfoBefore(LIS->getMBBEndIdx(MBB)); + if (!IsPrecedingDef(Idx)) + Worklist.append(MBB->pred_begin(), MBB->pred_end()); + } + + return !Instrs.empty(); +} + bool RegisterCoalescer::joinCopy( MachineInstr *CopyMI, bool &Again, SmallPtrSetImpl &CurrentErasedInstrs) { @@ -2183,6 +2311,34 @@ bool RegisterCoalescer::joinCopy( }); } + SmallPtrSet SubregToRegSrcInsts; + Register SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg(); + if (CopyMI->isSubregToReg() && !SrcReg.isPhysical()) { + // For the case where the copy instruction is a SUBREG_TO_REG, e.g. + // + // %0:gpr32 = movimm32 .. + // %1:gpr64 = SUBREG_TO_REG 0, %0, sub32 + // ... + // %0:gpr32 = COPY + // + // After joining liveranges, the original `movimm32` will need an + // implicit-def to make it explicit that the entire register is written, + // i.e. + // + // undef %0.sub32:gpr64 = movimm32 ..., implicit-def %0 + // ... 
+ // undef %0.sub32:gpr64 = COPY // Note that this does not + // // require an implicit-def, + // // because it has nothing to + // // do with the SUBREG_TO_REG. + LiveInterval &SrcInt = LIS->getInterval(SrcReg); + SlotIndex SubregToRegSlotIdx = LIS->getInstructionIndex(*CopyMI); + if (!findPrecedingDefs(SubregToRegSrcInsts, LIS, SrcInt, + CopyMI->getParent(), + SrcInt.Query(SubregToRegSlotIdx).valueIn())) + llvm_unreachable("SUBREG_TO_REG src requires a def"); + } + ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2253,7 +2409,8 @@ bool RegisterCoalescer::joinCopy( // Also update DstReg operands to include DstIdx if it is set. if (CP.getDstIdx()) updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), + &SubregToRegSrcInsts); // Shrink subregister ranges if necessary. if (ShrinkMask.any()) { diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 8ec4bfbb5a330..cf064b90a7d34 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -448,7 +448,7 @@ void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) { const MachineInstr *DefMI = LIS.getInstructionFromIndex(Def); assert(DefMI != nullptr); LaneBitmask LM; - for (const MachineOperand &DefOp : DefMI->defs()) { + for (const MachineOperand &DefOp : DefMI->all_defs()) { Register R = DefOp.getReg(); if (R != LI.reg()) continue; diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 8637b55c78f9c..daf3788639451 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -151,22 +151,84 @@ static bool isTypeTag(uint16_t Tag) { return false; } -bool DWARFLinker::DIECloner::getDIENames(const DWARFDie &Die, - AttributesInfo &Info, - OffsetsStringPool &StringPool, - bool StripTemplate) { 
+/// Recurse through the input DIE's canonical references until we find a +/// DW_AT_name. +llvm::StringRef +DWARFLinker::DIECloner::getCanonicalDIEName(DWARFDie Die, const DWARFFile &File, + CompileUnit *Unit) { + if (!Die) + return {}; + + std::optional Ref; + + auto GetDieName = [](const DWARFDie &D) -> llvm::StringRef { + auto NameForm = D.find(llvm::dwarf::DW_AT_name); + if (!NameForm) + return {}; + + auto NameOrErr = NameForm->getAsCString(); + if (!NameOrErr) { + llvm::consumeError(NameOrErr.takeError()); + return {}; + } + + return *NameOrErr; + }; + + llvm::StringRef Name = GetDieName(Die); + if (!Name.empty()) + return Name; + + while (true) { + if (!(Ref = Die.find(llvm::dwarf::DW_AT_specification)) && + !(Ref = Die.find(llvm::dwarf::DW_AT_abstract_origin))) + break; + + Die = Linker.resolveDIEReference(File, CompileUnits, *Ref, Die, Unit); + if (!Die) + break; + + assert(Unit); + + unsigned SpecIdx = Unit->getOrigUnit().getDIEIndex(Die); + CompileUnit::DIEInfo &SpecInfo = Unit->getInfo(SpecIdx); + if (SpecInfo.Ctxt && SpecInfo.Ctxt->hasCanonicalDIE()) { + if (!SpecInfo.Ctxt->getCanonicalName().empty()) { + Name = SpecInfo.Ctxt->getCanonicalName(); + break; + } + } + + Name = GetDieName(Die); + if (!Name.empty()) + break; + } + + return Name; +} + +bool DWARFLinker::DIECloner::getDIENames( + const DWARFDie &Die, AttributesInfo &Info, OffsetsStringPool &StringPool, + const DWARFFile &File, CompileUnit &Unit, bool StripTemplate) { // This function will be called on DIEs having low_pcs and // ranges. As getting the name might be more expansive, filter out // blocks directly. if (Die.getTag() == dwarf::DW_TAG_lexical_block) return false; + // The mangled name of an specification DIE will by virtue of the + // uniquing algorithm be the same as the one it got uniqued into. + // So just use the input DIE's linkage name. 
if (!Info.MangledName) if (const char *MangledName = Die.getLinkageName()) Info.MangledName = StringPool.getEntry(MangledName); + // For subprograms with linkage names, we unique on the linkage name, + // so DW_AT_name's may differ between the input and canonical DIEs. + // Use the name of the canonical DIE. if (!Info.Name) - if (const char *Name = Die.getShortName()) + if (llvm::StringRef Name = getCanonicalDIEName(Die, File, &Unit); + !Name.empty()) Info.Name = StringPool.getEntry(Name); if (!Info.MangledName) @@ -1939,7 +2001,7 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, // accelerator tables too. For now stick with dsymutil's behavior. if ((Info.InDebugMap || AttrInfo.HasLowPc || AttrInfo.HasRanges) && Tag != dwarf::DW_TAG_compile_unit && - getDIENames(InputDIE, AttrInfo, DebugStrPool, + getDIENames(InputDIE, AttrInfo, DebugStrPool, File, Unit, Tag != dwarf::DW_TAG_inlined_subroutine)) { if (AttrInfo.MangledName && AttrInfo.MangledName != AttrInfo.Name) Unit.addNameAccelerator(Die, AttrInfo.MangledName, @@ -1962,7 +2024,7 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, } else if (Tag == dwarf::DW_TAG_imported_declaration && AttrInfo.Name) { Unit.addNamespaceAccelerator(Die, AttrInfo.Name); } else if (isTypeTag(Tag) && !AttrInfo.IsDeclaration) { - bool Success = getDIENames(InputDIE, AttrInfo, DebugStrPool); + bool Success = getDIENames(InputDIE, AttrInfo, DebugStrPool, File, Unit); uint64_t RuntimeLang = dwarf::toUnsigned(InputDIE.find(dwarf::DW_AT_APPLE_runtime_class)) .value_or(0); diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinkerDeclContext.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinkerDeclContext.cpp index c9c8dddce9c44..66a1ba9c6711f 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinkerDeclContext.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinkerDeclContext.cpp @@ -84,24 +84,26 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE, break; } - StringRef NameRef; + StringRef Name = 
DIE.getShortName(); + StringRef NameForUniquing; StringRef FileRef; if (const char *LinkageName = DIE.getLinkageName()) - NameRef = StringPool.internString(LinkageName); - else if (const char *ShortName = DIE.getShortName()) - NameRef = StringPool.internString(ShortName); + NameForUniquing = StringPool.internString(LinkageName); + else if (!Name.empty()) + NameForUniquing = StringPool.internString(Name); - bool IsAnonymousNamespace = NameRef.empty() && Tag == dwarf::DW_TAG_namespace; + bool IsAnonymousNamespace = + NameForUniquing.empty() && Tag == dwarf::DW_TAG_namespace; if (IsAnonymousNamespace) { // FIXME: For dsymutil-classic compatibility. I think uniquing within // anonymous namespaces is wrong. There is no ODR guarantee there. - NameRef = "(anonymous namespace)"; + NameForUniquing = "(anonymous namespace)"; } if (Tag != dwarf::DW_TAG_class_type && Tag != dwarf::DW_TAG_structure_type && Tag != dwarf::DW_TAG_union_type && - Tag != dwarf::DW_TAG_enumeration_type && NameRef.empty()) + Tag != dwarf::DW_TAG_enumeration_type && NameForUniquing.empty()) return PointerIntPair(nullptr); unsigned Line = 0; @@ -140,10 +142,10 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE, } } - if (!Line && NameRef.empty()) + if (!Line && NameForUniquing.empty()) return PointerIntPair(nullptr); - // We hash NameRef, which is the mangled name, in order to get most + // We hash NameForUniquing, which is the mangled name, in order to get most // overloaded functions resolve correctly. // // Strictly speaking, hashing the Tag is only necessary for a @@ -153,7 +155,8 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE, // FIXME: dsymutil-classic won't unique the same type presented // once as a struct and once as a class. Using the Tag in the fully // qualified name hash to get the same effect. 
- unsigned Hash = hash_combine(Context.getQualifiedNameHash(), Tag, NameRef); + unsigned Hash = + hash_combine(Context.getQualifiedNameHash(), Tag, NameForUniquing); // FIXME: dsymutil-classic compatibility: when we don't have a name, // use the filename. @@ -161,15 +164,16 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE, Hash = hash_combine(Hash, FileRef); // Now look if this context already exists. - DeclContext Key(Hash, Line, ByteSize, Tag, NameRef, FileRef, Context); + DeclContext Key(Hash, Line, ByteSize, Tag, Name, NameForUniquing, FileRef, + Context); auto ContextIter = Contexts.find(&Key); if (ContextIter == Contexts.end()) { // The context wasn't found. bool Inserted; - DeclContext *NewContext = - new (Allocator) DeclContext(Hash, Line, ByteSize, Tag, NameRef, FileRef, - Context, DIE, U.getUniqueID()); + DeclContext *NewContext = new (Allocator) + DeclContext(Hash, Line, ByteSize, Tag, Name, NameForUniquing, FileRef, + Context, DIE, U.getUniqueID()); std::tie(ContextIter, Inserted) = Contexts.insert(NewContext); assert(Inserted && "Failed to insert DeclContext"); (void)Inserted; diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp index 8967a2eb1749e..49674b4c32de0 100644 --- a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp @@ -87,6 +87,19 @@ Error NativeSession::createFromPdb(std::unique_ptr Buffer, return Error::success(); } +static Error validatePdbMagic(StringRef PdbPath) { + file_magic Magic; + if (auto EC = identify_magic(PdbPath, Magic)) + return make_error(EC); + + if (Magic != file_magic::pdb) + return make_error( + raw_error_code::invalid_format, + "The input file did not contain the pdb file magic."); + + return Error::success(); +} + static Expected> loadPdbFile(StringRef PdbPath, std::unique_ptr &Allocator) { ErrorOr> ErrorOrBuffer = @@ -97,10 +110,8 @@ loadPdbFile(StringRef PdbPath, 
std::unique_ptr &Allocator) { std::unique_ptr Buffer = std::move(*ErrorOrBuffer); PdbPath = Buffer->getBufferIdentifier(); - file_magic Magic; - auto EC = identify_magic(PdbPath, Magic); - if (EC || Magic != file_magic::pdb) - return make_error(EC); + if (auto EC = validatePdbMagic(PdbPath)) + return std::move(EC); auto Stream = std::make_unique( std::move(Buffer), llvm::endianness::little); @@ -152,10 +163,8 @@ Error NativeSession::createFromExe(StringRef ExePath, if (!PdbPath) return PdbPath.takeError(); - file_magic Magic; - auto EC = identify_magic(PdbPath.get(), Magic); - if (EC || Magic != file_magic::pdb) - return make_error(EC); + if (auto EC = validatePdbMagic(PdbPath.get())) + return EC; auto Allocator = std::make_unique(); auto File = loadPdbFile(PdbPath.get(), Allocator); diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp index 8e9e89f0df353..daa40d4e9dcc6 100644 --- a/llvm/lib/Support/AllocToken.cpp +++ b/llvm/lib/Support/AllocToken.cpp @@ -24,6 +24,7 @@ llvm::getAllocTokenModeFromString(StringRef Name) { .Case("random", AllocTokenMode::Random) .Case("typehash", AllocTokenMode::TypeHash) .Case("typehashpointersplit", AllocTokenMode::TypeHashPointerSplit) + .Case("default", DefaultAllocTokenMode) .Default(std::nullopt); } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 73d9699f71477..5da6181ba36dd 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -162,8 +162,7 @@ class AArch64AsmPrinter : public AsmPrinter { Register ScratchReg, AArch64PACKey::ID Key, AArch64PAuth::AuthCheckMethod Method, - bool ShouldTrap, - const MCSymbol *OnFailure); + const MCSymbol *OnFailure = nullptr); // Check authenticated LR before tail calling. 
void emitPtrauthTailCallHardening(const MachineInstr *TC); @@ -1937,14 +1936,19 @@ Register AArch64AsmPrinter::emitPtrauthDiscriminator(uint16_t Disc, return ScratchReg; } -/// Emits a code sequence to check an authenticated pointer value. +/// Emit a code sequence to check an authenticated pointer value. /// -/// If OnFailure argument is passed, jump there on check failure instead -/// of proceeding to the next instruction (only if ShouldTrap is false). +/// This function emits a sequence of instructions that checks if TestedReg was +/// authenticated successfully. On success, execution continues at the next +/// instruction after the sequence. +/// +/// The action performed on failure depends on the OnFailure argument: +/// * if OnFailure is not nullptr, control is transferred to that label after +/// clearing the PAC field +/// * otherwise, BRK instruction is emitted to generate an error void AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue( Register TestedReg, Register ScratchReg, AArch64PACKey::ID Key, - AArch64PAuth::AuthCheckMethod Method, bool ShouldTrap, - const MCSymbol *OnFailure) { + AArch64PAuth::AuthCheckMethod Method, const MCSymbol *OnFailure) { // Insert a sequence to check if authentication of TestedReg succeeded, // such as: // @@ -1981,7 +1985,7 @@ void AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue( .addReg(getWRegFromXReg(ScratchReg)) .addReg(TestedReg) .addImm(0)); - assert(ShouldTrap && !OnFailure && "DummyLoad always traps on error"); + assert(!OnFailure && "DummyLoad always traps on error"); return; } @@ -2035,15 +2039,14 @@ void AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue( llvm_unreachable("Unsupported check method"); } - if (ShouldTrap) { - assert(!OnFailure && "Cannot specify OnFailure with ShouldTrap"); + if (!OnFailure) { // Trapping sequences do a 'brk'. 
// brk #<0xc470 + aut key> EmitToStreamer(MCInstBuilder(AArch64::BRK).addImm(0xc470 | Key)); } else { // Non-trapping checked sequences return the stripped result in TestedReg, - // skipping over success-only code (such as re-signing the pointer) if - // there is one. + // skipping over success-only code (such as re-signing the pointer) by + // jumping to OnFailure label. // Note that this can introduce an authentication oracle (such as based on // the high bits of the re-signed value). @@ -2068,12 +2071,9 @@ void AArch64AsmPrinter::emitPtrauthCheckAuthenticatedValue( MCInstBuilder(XPACOpc).addReg(TestedReg).addReg(TestedReg)); } - if (OnFailure) { - // b Lend - EmitToStreamer( - MCInstBuilder(AArch64::B) - .addExpr(MCSymbolRefExpr::create(OnFailure, OutContext))); - } + // b Lend + const auto *OnFailureExpr = MCSymbolRefExpr::create(OnFailure, OutContext); + EmitToStreamer(MCInstBuilder(AArch64::B).addExpr(OnFailureExpr)); } // If the auth check succeeds, we can continue. @@ -2100,9 +2100,8 @@ void AArch64AsmPrinter::emitPtrauthTailCallHardening(const MachineInstr *TC) { "Neither x16 nor x17 is available as a scratch register"); AArch64PACKey::ID Key = AArch64FI->shouldSignWithBKey() ? AArch64PACKey::IB : AArch64PACKey::IA; - emitPtrauthCheckAuthenticatedValue( - AArch64::LR, ScratchReg, Key, LRCheckMethod, - /*ShouldTrap=*/true, /*OnFailure=*/nullptr); + emitPtrauthCheckAuthenticatedValue(AArch64::LR, ScratchReg, Key, + LRCheckMethod); } void AArch64AsmPrinter::emitPtrauthAuthResign( @@ -2176,9 +2175,8 @@ void AArch64AsmPrinter::emitPtrauthAuthResign( if (IsAUTPAC && !ShouldTrap) EndSym = createTempSymbol("resign_end_"); - emitPtrauthCheckAuthenticatedValue(AUTVal, Scratch, AUTKey, - AArch64PAuth::AuthCheckMethod::XPAC, - ShouldTrap, EndSym); + emitPtrauthCheckAuthenticatedValue( + AUTVal, Scratch, AUTKey, AArch64PAuth::AuthCheckMethod::XPAC, EndSym); } // We already emitted unchecked and checked-but-non-trapping AUTs. 
@@ -2517,9 +2515,7 @@ void AArch64AsmPrinter::LowerMOVaddrPAC(const MachineInstr &MI) { : AArch64PACKey::DA); emitPtrauthCheckAuthenticatedValue(AArch64::X16, AArch64::X17, AuthKey, - AArch64PAuth::AuthCheckMethod::XPAC, - /*ShouldTrap=*/true, - /*OnFailure=*/nullptr); + AArch64PAuth::AuthCheckMethod::XPAC); } } else { EmitToStreamer(MCInstBuilder(AArch64::LDRXui) @@ -2652,9 +2648,7 @@ void AArch64AsmPrinter::LowerLOADgotAUTH(const MachineInstr &MI) { (AuthOpcode == AArch64::AUTIA ? AArch64PACKey::IA : AArch64PACKey::DA); emitPtrauthCheckAuthenticatedValue(AuthResultReg, AArch64::X17, AuthKey, - AArch64PAuth::AuthCheckMethod::XPAC, - /*ShouldTrap=*/true, - /*OnFailure=*/nullptr); + AArch64PAuth::AuthCheckMethod::XPAC); emitMovXReg(DstReg, AuthResultReg); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 1765d054a3c0d..123fc5bf37a19 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -437,6 +437,13 @@ std::pair RegBankLegalizeHelper::unpackAExt(Register Reg) { return {Lo.getReg(0), Hi.getReg(0)}; } +std::pair +RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) { + auto [Lo32, Hi32] = unpackAExt(Reg); + return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0), + B.buildTrunc(SgprRB_S16, Hi32).getReg(0)}; +} + void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) { Register Lo, Hi; switch (MI.getOpcode()) { @@ -629,14 +636,21 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == V2S16); - auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); - auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); unsigned Opc = MI.getOpcode(); auto Flags = MI.getFlags(); - auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); - auto Op1Hi = 
B.buildTrunc(SgprRB_S16, Op1Hi32); - auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); - auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + + if (MI.getNumOperands() == 2) { + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); + return; + } + + assert(MI.getNumOperands() == 3); + auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg()); + auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg()); auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index e7598f888e4b5..4f1c3c02fa5d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -118,6 +118,7 @@ class RegBankLegalizeHelper { std::pair unpackZExt(Register Reg); std::pair unpackSExt(Register Reg); std::pair unpackAExt(Register Reg); + std::pair unpackAExtTruncS16(Register Reg); void lowerUnpackBitShift(MachineInstr &MI); void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 097457f9f0deb..6ec51e1be8aca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -955,6 +955,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); + // FNEG and FABS are either folded as source modifiers or can be selected as + // bitwise XOR and AND with 
Mask. XOR and AND are available on SALU but for + // targets without SALU float we still select them as VGPR since there would + // be no real sgpr use. + addRulesForGOpcs({G_FNEG, G_FABS}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16}}) + .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat) + .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}}); + addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 8ef5874d7baf9..da287e0243d71 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -12,6 +12,7 @@ #include "SIDefines.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -1341,12 +1342,9 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, return; O << Name; - for (int I = 0; I < NumOps; ++I) { - if (I != 0) - O << ','; - - O << !!(Ops[I] & Mod); - } + ListSeparator Sep(","); + for (int I = 0; I < NumOps; ++I) + O << Sep << !!(Ops[I] & Mod); if (HasDstSel) { O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL); @@ -1584,14 +1582,10 @@ void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo, O << 
formatHex(static_cast(Val)); } else { O << "gpr_idx("; - bool NeedComma = false; + ListSeparator Sep(","); for (unsigned ModeId = ID_MIN; ModeId <= ID_MAX; ++ModeId) { - if (Val & (1 << ModeId)) { - if (NeedComma) - O << ','; - O << IdSymbolic[ModeId]; - NeedComma = true; - } + if (Val & (1 << ModeId)) + O << Sep << IdSymbolic[ModeId]; } O << ')'; } @@ -1798,25 +1792,16 @@ void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo, bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA); bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt; - bool NeedSpace = false; + ListSeparator Sep(" "); - if (!IsDefaultVmcnt || PrintAll) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultVmcnt || PrintAll) + O << Sep << "vmcnt(" << Vmcnt << ')'; - if (!IsDefaultExpcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } + if (!IsDefaultExpcnt || PrintAll) + O << Sep << "expcnt(" << Expcnt << ')'; - if (!IsDefaultLgkmcnt || PrintAll) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } + if (!IsDefaultLgkmcnt || PrintAll) + O << Sep << "lgkmcnt(" << Lgkmcnt << ')'; } void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, @@ -1832,14 +1817,10 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, StringRef Name; unsigned Val; bool IsDefault; - bool NeedSpace = false; + ListSeparator Sep(" "); while (decodeDepCtr(Imm16, Id, Name, Val, IsDefault, STI)) { - if (!IsDefault || !HasNonDefaultVal) { - if (NeedSpace) - O << ' '; - O << Name << '(' << Val << ')'; - NeedSpace = true; - } + if (!IsDefault || !HasNonDefaultVal) + O << Sep << Name << '(' << Val << ')'; } } else { O << formatHex(Imm16); diff --git a/llvm/lib/Target/BPF/BPF.td b/llvm/lib/Target/BPF/BPF.td index a7aa6274f5ac1..436b7eef600e7 100644 --- a/llvm/lib/Target/BPF/BPF.td +++ b/llvm/lib/Target/BPF/BPF.td @@ -31,6 +31,10 @@ def MisalignedMemAccess : 
SubtargetFeature<"allows-misaligned-mem-access", "AllowsMisalignedMemAccess", "true", "Allows misaligned memory access">; +def AllowBuiltinCall : SubtargetFeature<"allow-builtin-calls", + "AllowBuiltinCalls", "true", + "Allow calls to builtin functions">; + def : Proc<"generic", []>; def : Proc<"v1", []>; def : Proc<"v2", []>; diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index a8d1faa85116b..4485c41b4c0fa 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -208,6 +208,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, HasMovsx = STI.hasMovsx(); AllowsMisalignedMemAccess = STI.getAllowsMisalignedMemAccess(); + AllowBuiltinCalls = STI.getAllowBuiltinCalls(); } bool BPFTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align, @@ -567,9 +568,10 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (ExternalSymbolSDNode *E = dyn_cast(Callee)) { if (StringRef(E->getSymbol()) != BPF_TRAP) { Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0); - fail(CLI.DL, DAG, - Twine("A call to built-in function '" + StringRef(E->getSymbol()) + - "' is not supported.")); + if (!AllowBuiltinCalls) + fail(CLI.DL, DAG, + Twine("A call to built-in function '" + StringRef(E->getSymbol()) + + "' is not supported.")); } } @@ -1196,3 +1198,18 @@ bool BPFTargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; } + +bool BPFTargetLowering::shouldSignExtendTypeInLibCall(Type *Ty, + bool IsSigned) const { + return IsSigned || Ty->isIntegerTy(32); +} + +bool BPFTargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context, + const Type *RetTy) const { + // At minimal return Outs.size() <= 1, or check valid types in CC. 
+ SmallVector RVLocs; + CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); + return CCInfo.CheckReturn(Outs, getHasAlu32() ? RetCC_BPF32 : RetCC_BPF64); +} \ No newline at end of file diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h index 8607e4f8c9e69..a5036e31cb61d 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.h +++ b/llvm/lib/Target/BPF/BPFISelLowering.h @@ -68,6 +68,8 @@ class BPFTargetLowering : public TargetLowering { // Allows Misalignment bool AllowsMisalignedMemAccess; + bool AllowBuiltinCalls; + SDValue LowerSDIVSREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; @@ -163,6 +165,14 @@ class BPFTargetLowering : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserterLDimm64(MachineInstr &MI, MachineBasicBlock *BB) const; + + // Returns true if arguments should be sign-extended in lib calls. + bool shouldSignExtendTypeInLibCall(Type *Ty, bool IsSigned) const override; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool IsVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context, const Type *RetTy) const override; }; } diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp index 726f8f4b39827..77a1a5fe7444c 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.cpp +++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp @@ -70,6 +70,7 @@ void BPFSubtarget::initializeEnvironment() { HasLoadAcqStoreRel = false; HasGotox = false; AllowsMisalignedMemAccess = false; + AllowBuiltinCalls = false; } void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h index 24eff862224b0..40751fc9b7454 100644 --- a/llvm/lib/Target/BPF/BPFSubtarget.h +++ b/llvm/lib/Target/BPF/BPFSubtarget.h @@ -70,6 +70,8 @@ class BPFSubtarget : public BPFGenSubtargetInfo { 
bool HasLdsx, HasMovsx, HasBswap, HasSdivSmod, HasGotol, HasStoreImm, HasLoadAcqStoreRel, HasGotox; + bool AllowBuiltinCalls; + std::unique_ptr CallLoweringInfo; std::unique_ptr InstSelector; std::unique_ptr Legalizer; @@ -101,6 +103,7 @@ class BPFSubtarget : public BPFGenSubtargetInfo { bool hasStoreImm() const { return HasStoreImm; } bool hasLoadAcqStoreRel() const { return HasLoadAcqStoreRel; } bool hasGotox() const { return HasGotox; } + bool getAllowBuiltinCalls() const { return AllowBuiltinCalls; } bool isLittleEndian() const { return IsLittleEndian; } diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index 9f1616f6960fe..5f18c37ef1125 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -29,20 +29,6 @@ static const int MaxVecSize = 4; using namespace llvm; -// Recursively creates an array-like version of a given vector type. -static Type *equivalentArrayTypeFromVector(Type *T) { - if (auto *VecTy = dyn_cast(T)) - return ArrayType::get(VecTy->getElementType(), - dyn_cast(VecTy)->getNumElements()); - if (auto *ArrayTy = dyn_cast(T)) { - Type *NewElementType = - equivalentArrayTypeFromVector(ArrayTy->getElementType()); - return ArrayType::get(NewElementType, ArrayTy->getNumElements()); - } - // If it's not a vector or array, return the original type. - return T; -} - class DXILDataScalarizationLegacy : public ModulePass { public: @@ -121,12 +107,25 @@ DataScalarizerVisitor::lookupReplacementGlobal(Value *CurrOperand) { static bool isVectorOrArrayOfVectors(Type *T) { if (isa(T)) return true; - if (ArrayType *ArrType = dyn_cast(T)) - return isa(ArrType->getElementType()) || - isVectorOrArrayOfVectors(ArrType->getElementType()); + if (ArrayType *ArrayTy = dyn_cast(T)) + return isVectorOrArrayOfVectors(ArrayTy->getElementType()); return false; } +// Recursively creates an array-like version of a given vector type. 
+static Type *equivalentArrayTypeFromVector(Type *T) { + if (auto *VecTy = dyn_cast(T)) + return ArrayType::get(VecTy->getElementType(), + dyn_cast(VecTy)->getNumElements()); + if (auto *ArrayTy = dyn_cast(T)) { + Type *NewElementType = + equivalentArrayTypeFromVector(ArrayTy->getElementType()); + return ArrayType::get(NewElementType, ArrayTy->getNumElements()); + } + // If it's not a vector or array, return the original type. + return T; +} + bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) { Type *AllocatedType = AI.getAllocatedType(); if (!isVectorOrArrayOfVectors(AllocatedType)) @@ -135,7 +134,7 @@ bool DataScalarizerVisitor::visitAllocaInst(AllocaInst &AI) { IRBuilder<> Builder(&AI); Type *NewType = equivalentArrayTypeFromVector(AllocatedType); AllocaInst *ArrAlloca = - Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarize"); + Builder.CreateAlloca(NewType, nullptr, AI.getName() + ".scalarized"); ArrAlloca->setAlignment(AI.getAlign()); AI.replaceAllUsesWith(ArrAlloca); AI.eraseFromParent(); @@ -303,78 +302,44 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { GEPOperator *GOp = cast(&GEPI); Value *PtrOperand = GOp->getPointerOperand(); - Type *NewGEPType = GOp->getSourceElementType(); - - // Unwrap GEP ConstantExprs to find the base operand and element type - while (auto *GEPCE = dyn_cast_or_null( - dyn_cast(PtrOperand))) { - GOp = GEPCE; - PtrOperand = GEPCE->getPointerOperand(); - NewGEPType = GEPCE->getSourceElementType(); - } - - Type *const OrigGEPType = NewGEPType; - Value *const OrigOperand = PtrOperand; - - if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) { - NewGEPType = NewGlobal->getValueType(); - PtrOperand = NewGlobal; - } else if (AllocaInst *Alloca = dyn_cast(PtrOperand)) { - Type *AllocatedType = Alloca->getAllocatedType(); - if (isa(AllocatedType) && - AllocatedType != 
GOp->getResultElementType()) - NewGEPType = AllocatedType; - } else - return false; // Only GEPs into an alloca or global variable are considered - - // Defer changing i8 GEP types until dxil-flatten-arrays - if (OrigGEPType->isIntegerTy(8)) - NewGEPType = OrigGEPType; - - // If the original type is a "sub-type" of the new type, then ensure the gep - // correctly zero-indexes the extra dimensions to keep the offset calculation - // correct. - // Eg: - // i32, [4 x i32] and [8 x [4 x i32]] are sub-types of [8 x [4 x i32]], etc. - // - // So then: - // gep [4 x i32] %idx - // -> gep [8 x [4 x i32]], i32 0, i32 %idx - // gep i32 %idx - // -> gep [8 x [4 x i32]], i32 0, i32 0, i32 %idx - uint32_t MissingDims = 0; - Type *SubType = NewGEPType; - - // The new type will be in its array version; so match accordingly. - Type *const GEPArrType = equivalentArrayTypeFromVector(OrigGEPType); - - while (SubType != GEPArrType) { - MissingDims++; - - ArrayType *ArrType = dyn_cast(SubType); - if (!ArrType) { - assert(SubType == GEPArrType && - "GEP uses an DXIL invalid sub-type of alloca/global variable"); - break; - } - - SubType = ArrType->getElementType(); + Type *GEPType = GOp->getSourceElementType(); + + // Replace a GEP ConstantExpr pointer operand with a GEP instruction so that + // it can be visited + if (auto *PtrOpGEPCE = dyn_cast(PtrOperand); + PtrOpGEPCE && PtrOpGEPCE->getOpcode() == Instruction::GetElementPtr) { + GetElementPtrInst *OldGEPI = + cast(PtrOpGEPCE->getAsInstruction()); + OldGEPI->insertBefore(GEPI.getIterator()); + + IRBuilder<> Builder(&GEPI); + SmallVector Indices(GEPI.indices()); + Value *NewGEP = + Builder.CreateGEP(GEPI.getSourceElementType(), OldGEPI, Indices, + GEPI.getName(), GEPI.getNoWrapFlags()); + assert(isa(NewGEP) && + "Expected newly-created GEP to be an instruction"); + GetElementPtrInst *NewGEPI = cast(NewGEP); + + GEPI.replaceAllUsesWith(NewGEPI); + GEPI.eraseFromParent(); + visitGetElementPtrInst(*OldGEPI); + 
visitGetElementPtrInst(*NewGEPI); + return true; } - bool NeedsTransform = OrigOperand != PtrOperand || - OrigGEPType != NewGEPType || MissingDims != 0; + Type *NewGEPType = equivalentArrayTypeFromVector(GEPType); + Value *NewPtrOperand = PtrOperand; + if (GlobalVariable *NewGlobal = lookupReplacementGlobal(PtrOperand)) + NewPtrOperand = NewGlobal; + bool NeedsTransform = NewPtrOperand != PtrOperand || NewGEPType != GEPType; if (!NeedsTransform) return false; IRBuilder<> Builder(&GEPI); - SmallVector Indices; - - for (uint32_t I = 0; I < MissingDims; I++) - Indices.push_back(Builder.getInt32(0)); - llvm::append_range(Indices, GOp->indices()); - - Value *NewGEP = Builder.CreateGEP(NewGEPType, PtrOperand, Indices, + SmallVector Indices(GOp->idx_begin(), GOp->idx_end()); + Value *NewGEP = Builder.CreateGEP(NewGEPType, NewPtrOperand, Indices, GOp->getName(), GOp->getNoWrapFlags()); GOp->replaceAllUsesWith(NewGEP); diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 632b44c3cf635..a59586bdb4476 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -554,6 +554,7 @@ static void cacheDIVar(FrameDataInfo &FrameData, DIVarCache.insert({V, (*I)->getVariable()}); }; CacheIt(findDVRDeclares(V)); + CacheIt(findDVRDeclareValues(V)); } } @@ -1142,6 +1143,47 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { for_each(DVRs, SalvageOne); } + TinyPtrVector DVRDeclareValues = + findDVRDeclareValues(Def); + // Try best to find dbg.declare_value. If the spill is a temp, there may + // not be a direct dbg.declare_value. Walk up the load chain to find one + // from an alias. + if (F->getSubprogram()) { + auto *CurDef = Def; + while (DVRDeclareValues.empty() && isa(CurDef)) { + auto *LdInst = cast(CurDef); + // Only consider ptr to ptr same type load. 
+ if (LdInst->getPointerOperandType() != LdInst->getType()) + break; + CurDef = LdInst->getPointerOperand(); + if (!isa(CurDef)) + break; + DVRDeclareValues = findDVRDeclareValues(CurDef); + } + } + + auto SalvageOneCoro = [&](auto *DDI) { + // This dbg.declare_value is preserved for all coro-split function + // fragments. It will be unreachable in the main function, and + // processed by coro::salvageDebugInfo() by the Cloner. However, convert + // it to a dbg.declare to make sure future passes don't have to deal + // with a dbg.declare_value. + auto *VAM = ValueAsMetadata::get(CurrentReload); + Type *Ty = VAM->getValue()->getType(); + // If the metadata type is not a pointer, emit a dbg.value instead. + DbgVariableRecord *NewDVR = new DbgVariableRecord( + ValueAsMetadata::get(CurrentReload), DDI->getVariable(), + DDI->getExpression(), DDI->getDebugLoc(), + Ty->isPointerTy() ? DbgVariableRecord::LocationType::Declare + : DbgVariableRecord::LocationType::Value); + Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore( + NewDVR, Builder.GetInsertPoint()); + // This dbg.declare_value is for the main function entry point. It + // will be deleted in all coro-split functions. + coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); + }; + for_each(DVRDeclareValues, SalvageOneCoro); + // If we have a single edge PHINode, remove it and replace it with a // reload from the coroutine frame. (We already took care of multi edge // PHINodes by normalizing them in the rewritePHIs function). @@ -1925,7 +1967,7 @@ void coro::salvageDebugInfo( Function *F = DVR.getFunction(); // Follow the pointer arithmetic all the way to the incoming // function argument and convert into a DIExpression. 
- bool SkipOutermostLoad = DVR.isDbgDeclare(); + bool SkipOutermostLoad = DVR.isDbgDeclare() || DVR.isDbgDeclareValue(); Value *OriginalStorage = DVR.getVariableLocationOp(0); auto SalvagedInfo = @@ -1939,10 +1981,11 @@ void coro::salvageDebugInfo( DVR.replaceVariableLocationOp(OriginalStorage, Storage); DVR.setExpression(Expr); - // We only hoist dbg.declare today since it doesn't make sense to hoist - // dbg.value since it does not have the same function wide guarantees that - // dbg.declare does. - if (DVR.getType() == DbgVariableRecord::LocationType::Declare) { + // We only hoist dbg.declare and dbg.declare_value today since it doesn't make + // sense to hoist dbg.value since it does not have the same function wide + // guarantees that dbg.declare does. + if (DVR.getType() == DbgVariableRecord::LocationType::Declare || + DVR.getType() == DbgVariableRecord::LocationType::DeclareValue) { std::optional InsertPt; if (auto *I = dyn_cast(Storage)) { InsertPt = I->getInsertionPointAfterDef(); @@ -1957,6 +2000,19 @@ void coro::salvageDebugInfo( InsertPt = F->getEntryBlock().begin(); if (InsertPt) { DVR.removeFromParent(); + // If there is a dbg.declare_value being reinserted, insert it as a + // dbg.declare instead, so that subsequent passes don't have to deal with + // a dbg.declare_value. 
+ if (DVR.getType() == DbgVariableRecord::LocationType::DeclareValue) { + auto *MD = DVR.getRawLocation(); + if (auto *VAM = dyn_cast(MD)) { + Type *Ty = VAM->getValue()->getType(); + if (Ty->isPointerTy()) + DVR.Type = DbgVariableRecord::LocationType::Declare; + else + DVR.Type = DbgVariableRecord::LocationType::Value; + } + } (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 8a435accfedfe..0c7d9c0193a03 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1854,12 +1854,6 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } - bool areAllOperandsInvariant() const { - return all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - }); - } - public: VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef Operands, const VPIRFlags &Flags = {}, @@ -1898,14 +1892,7 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { } /// Returns true if the recipe only uses the first lane of operand \p Op. 
- bool usesFirstLaneOnly(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - if (Op == getOperand(0)) - return isPointerLoopInvariant(); - else - return !isPointerLoopInvariant() && Op->isDefinedOutsideLoopRegions(); - } + bool usesFirstLaneOnly(const VPValue *Op) const override; protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5ea9dd349e06f..54fdec3bcf4a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2536,6 +2536,11 @@ void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } #endif +bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const { + assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); + return vputils::isSingleScalar(Op); +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); // Construct a vector GEP by widening the operands of the scalar GEP as @@ -2544,51 +2549,32 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (areAllOperandsInvariant()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. 
We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - SmallVector Ops; - for (unsigned I = 0, E = getNumOperands(); I != E; I++) - Ops.push_back(State.get(getOperand(I), VPLane(0))); - - auto *NewGEP = - State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops), - "", getGEPNoWrapFlags()); - Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); - State.set(this, Splat); - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers unless VF is scalar. - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, - "", getGEPNoWrapFlags()); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP); - } + assert( + any_of(operands(), + [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) && + "Expected at least one loop-variant operand"); + + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers unless VF is scalar. + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. 
+ SmallVector Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, + "", getGEPNoWrapFlags()); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e7a8773be067b..89b490e960f33 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1451,7 +1451,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (!isa(&R)) + if (!isa(&R)) continue; auto *RepR = dyn_cast(&R); if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 57481724936a3..cab2741be9929 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -12,7 +12,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load 
(s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -46,13 +46,13 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $x0, $x2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def $x9, pcsections !0 :: (load (s32) from %ir.pnew) + ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def renamable $x9, pcsections !0 :: (load (s32) from %ir.pnew) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $x0, $x9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -91,7 +91,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -243,7 +243,7 @@ define i32 @fetch_and_nand(ptr %p) { ; CHECK-NEXT: successors: 
%bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w8, 2, pcsections !0 ; CHECK-NEXT: $w9 = ORNWrs $wzr, killed renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRW killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -295,7 +295,7 @@ define i32 @fetch_and_or(ptr %p) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $w10 = ORRWrs renamable $w8, renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRW killed renamable $w10, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 @@ -726,7 +726,7 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW 
killed renamable $w10, %bb.1, pcsections !0 @@ -750,7 +750,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRB renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -773,7 +773,7 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -797,7 +797,7 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed 
renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -821,7 +821,7 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -845,7 +845,7 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -869,7 +869,7 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile 
load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, pcsections !0 @@ -895,7 +895,7 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, pcsections !0 @@ -923,10 +923,10 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRB renamable $w10, 
renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -951,10 +951,10 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -977,7 +977,7 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable 
$w10, %bb.1, pcsections !0 @@ -1001,7 +1001,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRH renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1024,7 +1024,7 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -1048,7 +1048,7 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed 
renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -1072,7 +1072,7 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -1096,7 +1096,7 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 @@ -1120,7 +1120,7 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, 
pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, pcsections !0 @@ -1146,7 +1146,7 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, pcsections !0 @@ -1174,10 +1174,10 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber 
renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1202,10 +1202,10 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1230,7 +1230,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: 
Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 @@ -1273,7 +1273,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: successors: %bb.2(0x7ffff800), %bb.3(0x00000800) ; CHECK-NEXT: liveins: $w1, $w2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 8, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 diff --git a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll index 50fac819d4afe..e37d8f7da4bfc 100644 --- a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll +++ b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -aarch64-min-jump-table-entries=4 -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -aarch64-min-jump-table-entries=4 -mtriple=arm64-apple-ios -enable-subreg-liveness=false < %s | sed -e "/; kill: /d" | FileCheck %s +; RUN: llc -aarch64-min-jump-table-entries=4 -mtriple=arm64-apple-ios -enable-subreg-liveness=true < %s | FileCheck %s ; Check there's no assert in spilling from implicit-def operands on an ; IMPLICIT_DEF. 
@@ -92,7 +93,6 @@ define void @widget(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %a ; CHECK-NEXT: ldr x8, [sp, #40] ; 8-byte Reload ; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: mov x1, xzr -; CHECK-NEXT: ; kill: def $w8 killed $w8 killed $x8 def $x8 ; CHECK-NEXT: str x8, [sp] ; CHECK-NEXT: bl _fprintf ; CHECK-NEXT: brk #0x1 diff --git a/llvm/test/CodeGen/AArch64/pr151592.mir b/llvm/test/CodeGen/AArch64/pr151592.mir new file mode 100644 index 0000000000000..dbcc1f8c08e9a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr151592.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -run-pass=register-coalescer -o - %s | FileCheck %s +--- +name: reproducer +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: reproducer + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: undef [[MOVIv2d_ns:%[0-9]+]].qsub1:zpr2 = MOVIv2d_ns 0, implicit-def [[MOVIv2d_ns]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64sp = COPY $xzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBNZW [[COPY1]], 0, %bb.3 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]].dsub:zpr2 = LDRDui [[COPY2]], 0, implicit-def [[MOVIv2d_ns]].zsub + ; CHECK-NEXT: ST2Twov2d [[MOVIv2d_ns]].zsub_qsub1, [[COPY]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub1, [[COPY]], 1 + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub1, [[COPY]], 0 + ; CHECK-NEXT: B %bb.1 + bb.0: + liveins: $w0, $x1 + %0:gpr64common = 
COPY $x1 + %1:gpr32 = COPY $w0 + %2:gpr32 = COPY %1:gpr32 + undef %8.qsub1:qq = MOVIv2d_ns 0 + %4:zpr = SUBREG_TO_REG 0, %8.qsub1:qq, %subreg.zsub + %5:gpr64sp = COPY $xzr + + bb.1: + TBNZW %2:gpr32, 0, %bb.3 + B %bb.2 + + bb.2: + %8.dsub:qq = LDRDui %5:gpr64sp, 0, implicit-def %8.qsub0:qq + ST2Twov2d %8:qq, %0:gpr64common + + bb.3: + STR_ZXI %4:zpr, %0:gpr64common, 1 + STR_ZXI %4:zpr, %0:gpr64common, 0 + B %bb.1 +... +--- +name: reproducer2 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: reproducer2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: undef [[MOVIv2d_ns:%[0-9]+]].zsub:zpr2 = MOVIv2d_ns 0 + ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]].qsub1:zpr2 = MOVIv2d_ns 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64sp = COPY $xzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBNZW [[COPY1]], 0, %bb.3 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]].dsub:zpr2 = LDRDui [[COPY2]], 0, implicit-def [[MOVIv2d_ns]].zsub + ; CHECK-NEXT: ST2Twov2d [[MOVIv2d_ns]].zsub_qsub1, [[COPY]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub1, [[COPY]], 1 + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub1, [[COPY]], 0 + ; CHECK-NEXT: B %bb.1 + bb.0: + liveins: $w0, $x1 + %0:gpr64common = COPY $x1 + %1:gpr32 = COPY $w0 + %2:gpr32 = COPY %1:gpr32 + undef %8.qsub0:qq = MOVIv2d_ns 0 + %8.qsub1:qq = MOVIv2d_ns 0 + %4:zpr = SUBREG_TO_REG 0, %8.qsub1:qq, %subreg.zsub + %5:gpr64sp = COPY $xzr + + bb.1: + TBNZW %2:gpr32, 0, %bb.3 + B %bb.2 + + bb.2: + %8.dsub:qq = LDRDui 
%5:gpr64sp, 0, implicit-def %8.qsub0:qq + ST2Twov2d %8:qq, %0:gpr64common + + bb.3: + STR_ZXI %4:zpr, %0:gpr64common, 1 + STR_ZXI %4:zpr, %0:gpr64common, 0 + B %bb.1 +... +--- +name: reproducer3 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: reproducer3 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: undef [[MOVIv2d_ns:%[0-9]+]].qsub1:zpr2 = MOVIv2d_ns 0 + ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]].zsub:zpr2 = MOVIv2d_ns 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64sp = COPY $xzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBNZW [[COPY1]], 0, %bb.3 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]].dsub1:zpr2 = LDRDui [[COPY2]], 0, implicit-def [[MOVIv2d_ns]].qsub1 + ; CHECK-NEXT: ST2Twov2d [[MOVIv2d_ns]].zsub_qsub1, [[COPY]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub0, [[COPY]], 1 + ; CHECK-NEXT: STR_ZXI [[MOVIv2d_ns]].zsub0, [[COPY]], 0 + ; CHECK-NEXT: B %bb.1 + bb.0: + liveins: $w0, $x1 + %0:gpr64common = COPY $x1 + %1:gpr32 = COPY $w0 + %2:gpr32 = COPY %1:gpr32 + undef %8.qsub1:qq = MOVIv2d_ns 0 + %8.qsub0:qq = MOVIv2d_ns 0 + %4:zpr = SUBREG_TO_REG 0, %8.qsub0:qq, %subreg.zsub + %5:gpr64sp = COPY $xzr + + bb.1: + TBNZW %2:gpr32, 0, %bb.3 + B %bb.2 + + bb.2: + %8.dsub1:qq = LDRDui %5:gpr64sp, 0, implicit-def %8.qsub1:qq + ST2Twov2d %8:qq, %0:gpr64common + + bb.3: + STR_ZXI %4:zpr, %0:gpr64common, 1 + STR_ZXI %4:zpr, %0:gpr64common, 0 + B %bb.1 +... 
diff --git a/llvm/test/CodeGen/AArch64/pr151888.mir b/llvm/test/CodeGen/AArch64/pr151888.mir new file mode 100644 index 0000000000000..5b66f136cbc4f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr151888.mir @@ -0,0 +1,17 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64 -run-pass=register-coalescer -o - %s | FileCheck %s +--- +name: reproducer +tracksRegLiveness: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: reproducer + ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri $xzr, 0, 31 + ; CHECK-NEXT: $x0 = COPY [[UBFMXri]] + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %1:gpr32 = COPY killed $wzr + %2:gpr64 = SUBREG_TO_REG 0, %1:gpr32, %subreg.sub_32 + %3:gpr64 = UBFMXri %2:gpr64, 0, 31 + $x0 = COPY %3:gpr64 + RET_ReallyLR implicit $x0 +... diff --git a/llvm/test/CodeGen/AArch64/pr164181-reduced.ll b/llvm/test/CodeGen/AArch64/pr164181-reduced.ll new file mode 100644 index 0000000000000..192893e6a08cc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr164181-reduced.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; +; This is a reduced version of pr164181.ll, which failed with subreg liveness enabled +; after adding the implicit-def for a SUBREG_TO_REG to mark the top 32-bits of a register +; being written by a MOVi32imm instruction. This previously failed the machine verifier +; because the liverange for the top 32-bits weren't updated when rematerializing the +; MOVi32imm. 
+; +; RUN: llc -verify-machineinstrs -enable-subreg-liveness=true < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -enable-subreg-liveness=false < %s | FileCheck %s +target triple = "aarch64-unknown-linux-gnu" + +define void @f(i1 %var_0, i64 %var_2, i64 %var_11, ptr %arr_3, ptr %arr_4, ptr %arr_7, ptr %arr_13, ptr %invariant.gep875.us, ptr %arrayidx384.us, i16 %0, i1 %tobool435.not.us, ptr %gep876.us, i16 %cond464.in.us, ptr %1, i16 %conv227.us, i1 %cmp378.us) #0 { +; CHECK-LABEL: f: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: .cfi_offset w27, -72 +; CHECK-NEXT: .cfi_offset w28, -80 +; CHECK-NEXT: .cfi_offset w30, -96 +; CHECK-NEXT: ldrb w9, [sp, #152] +; CHECK-NEXT: ldrh w10, [sp, #144] +; CHECK-NEXT: mov x19, #-18403 // =0xffffffffffffb81d +; CHECK-NEXT: ldr x11, [sp, #136] +; CHECK-NEXT: ldrh w12, [sp, #128] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: ldr x13, [sp, #120] +; CHECK-NEXT: ldrb w14, [sp, #112] +; CHECK-NEXT: mov w15, wzr +; CHECK-NEXT: ldrh w16, [sp, #104] +; CHECK-NEXT: ldr x17, [sp, #96] +; CHECK-NEXT: mov w18, #149 // =0x95 +; CHECK-NEXT: movk x19, #58909, lsl #16 +; CHECK-NEXT: mov w20, #-18978 // =0xffffb5de +; CHECK-NEXT: mov w21, #1 // =0x1 +; CHECK-NEXT: mov w22, #-7680 // =0xffffe200 +; CHECK-NEXT: mov w23, #36006 // =0x8ca6 +; CHECK-NEXT: mov x25, 
xzr +; CHECK-NEXT: mov x24, xzr +; CHECK-NEXT: .LBB0_1: // %for.body99.us +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB0_4 Depth 2 +; CHECK-NEXT: // Child Loop BB0_10 Depth 2 +; CHECK-NEXT: mov w27, w15 +; CHECK-NEXT: mov x26, x25 +; CHECK-NEXT: mov x28, x24 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .LBB0_2: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: mov w25, #1 // =0x1 +; CHECK-NEXT: .LBB0_3: // %for.inc371.us +; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: mul w27, w15, w20 +; CHECK-NEXT: mov x28, xzr +; CHECK-NEXT: mov x26, x2 +; CHECK-NEXT: tbz w0, #0, .LBB0_9 +; CHECK-NEXT: .LBB0_4: // %for.body194.us +; CHECK-NEXT: // Parent Loop BB0_1 Depth=1 +; CHECK-NEXT: // => This Inner Loop Header: Depth=2 +; CHECK-NEXT: orr x15, x28, x19 +; CHECK-NEXT: mov x24, x28 +; CHECK-NEXT: strh w10, [x13] +; CHECK-NEXT: strb w18, [x5] +; CHECK-NEXT: str x15, [x4] +; CHECK-NEXT: mov w15, w27 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: str x1, [x3] +; CHECK-NEXT: tbz w14, #0, .LBB0_2 +; CHECK-NEXT: // %bb.5: // %if.then327.us +; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: cbnz w21, .LBB0_7 +; CHECK-NEXT: // %bb.6: // %cond.true331.us +; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: ldrsb w27, [x8] +; CHECK-NEXT: b .LBB0_8 +; CHECK-NEXT: .LBB0_7: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: mov w27, wzr +; CHECK-NEXT: .LBB0_8: // %cond.end345.us +; CHECK-NEXT: // in Loop: Header=BB0_4 Depth=2 +; CHECK-NEXT: mov x25, xzr +; CHECK-NEXT: strh w27, [x3] +; CHECK-NEXT: str x26, [x6] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_9: // %for.cond376.preheader.us +; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: mov x26, xzr +; CHECK-NEXT: mov w27, wzr +; CHECK-NEXT: .LBB0_10: // %for.body380.us +; CHECK-NEXT: // Parent Loop BB0_1 Depth=1 +; CHECK-NEXT: // => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ands w28, w0, #0x1 +; CHECK-NEXT: orr x26, x26, #0x1 +; CHECK-NEXT: strh w16, [x7] +; 
CHECK-NEXT: csel w30, w23, w22, ne +; CHECK-NEXT: tst w12, #0xffff +; CHECK-NEXT: csel w21, wzr, w27, eq +; CHECK-NEXT: cmp w28, #0 +; CHECK-NEXT: str w30, [x17] +; CHECK-NEXT: csel w27, w27, w21, ne +; CHECK-NEXT: tbnz w9, #0, .LBB0_10 +; CHECK-NEXT: // %bb.11: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: mov w21, #1 // =0x1 +; CHECK-NEXT: b .LBB0_1 +entry: + br label %for.body99.us + +for.body99.us: ; preds = %for.inc505.us, %entry + %mul287985.us = phi i16 [ 0, %entry ], [ %mul287986.us, %for.inc505.us ] + %mul354905.us = phi i64 [ 0, %entry ], [ %mul354907.us, %for.inc505.us ] + %sub283896.us = phi i64 [ 0, %entry ], [ %sub283897.us, %for.inc505.us ] + %conv96880.us4 = phi i64 [ 0, %entry ], [ 0, %for.inc505.us ] + br label %for.body194.us + +for.body380.us: ; preds = %for.cond376.preheader.us, %for.inc505.us + %indvars.iv10181 = phi i64 [ 0, %for.cond376.preheader.us ], [ %indvars.iv.next1019, %for.inc505.us ] + %2 = phi i8 [ 0, %for.cond376.preheader.us ], [ %3, %for.inc505.us ] + store i16 %0, ptr %invariant.gep875.us, align 2 + %arrayidx416.us = getelementptr i16, ptr %arr_13, i64 %indvars.iv10181 + %conv419.us = select i1 %var_0, i32 36006, i32 -7680 + store i32 %conv419.us, ptr %arrayidx384.us, align 4 + br i1 %var_0, label %for.inc505.us, label %if.then436.us + +if.then436.us: ; preds = %for.body380.us + %cond464.in.us6 = load i16, ptr null, align 2 + %tobool465.not.us = icmp eq i16 %cond464.in.us, 0 + %spec.select = select i1 %tobool465.not.us, i8 0, i8 %2 + br label %for.inc505.us + +for.inc505.us: ; preds = %if.then436.us, %for.body380.us + %3 = phi i8 [ %2, %for.body380.us ], [ %spec.select, %if.then436.us ] + %indvars.iv.next1019 = or i64 %indvars.iv10181, 1 + br i1 %cmp378.us, label %for.body380.us, label %for.body99.us + +for.body194.us: ; preds = %for.inc371.us, %for.body99.us + %mul287986.us = phi i16 [ %mul287985.us, %for.body99.us ], [ %mul287.us, %for.inc371.us ] + %mul354906.us = phi i64 [ %mul354905.us, %for.body99.us ], [ %var_11, 
%for.inc371.us ] + %sub283897.us = phi i64 [ %sub283896.us, %for.body99.us ], [ 0, %for.inc371.us ] + store i16 %conv227.us, ptr %gep876.us, align 2 + store i8 -107, ptr %arr_7, align 1 + %sub283.us = or i64 %sub283897.us, -434259939 + store i64 %sub283.us, ptr %arr_4, align 8 + %mul287.us = mul i16 %mul287986.us, -18978 + store i64 0, ptr %1, align 8 + store i64 %var_2, ptr %arr_3, align 8 + br i1 %tobool435.not.us, label %if.then327.us, label %for.inc371.us + +if.then327.us: ; preds = %for.body194.us + %tobool330.not.us = icmp eq i32 0, 0 + br i1 %tobool330.not.us, label %cond.end345.us, label %cond.true331.us + +cond.true331.us: ; preds = %if.then327.us + %4 = load i8, ptr null, align 1 + %5 = sext i8 %4 to i16 + br label %cond.end345.us + +cond.end345.us: ; preds = %cond.true331.us, %if.then327.us + %cond346.us = phi i16 [ %5, %cond.true331.us ], [ 0, %if.then327.us ] + store i16 %cond346.us, ptr %arr_3, align 2 + store i64 %mul354906.us, ptr %arr_13, align 8 + br label %for.inc371.us + +for.inc371.us: ; preds = %cond.end345.us, %for.body194.us + %mul354907.us = phi i64 [ 1, %for.body194.us ], [ 0, %cond.end345.us ] + br i1 %var_0, label %for.body194.us, label %for.cond376.preheader.us + +for.cond376.preheader.us: ; preds = %for.inc371.us + %arrayidx384.us9 = getelementptr i16, ptr null, i64 %conv96880.us4 + br label %for.body380.us +} + +attributes #0 = { "frame-pointer"="non-leaf" } diff --git a/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll index 2a77d4dd33fe5..4206c0bc26991 100644 --- a/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll +++ b/llvm/test/CodeGen/AArch64/preserve_nonecc_varargs_darwin.ll @@ -27,11 +27,12 @@ define i32 @caller() nounwind ssp { ; CHECK-NEXT: sub sp, sp, #208 ; CHECK-NEXT: mov w8, #10 ; =0xa ; CHECK-NEXT: mov w9, #9 ; =0x9 -; CHECK-NEXT: mov w10, #8 ; =0x8 +; CHECK-NEXT: mov w0, #1 ; =0x1 ; CHECK-NEXT: stp x9, x8, [sp, #24] -; CHECK-NEXT: mov w8, 
#7 ; =0x7 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: mov w9, #6 ; =0x6 -; CHECK-NEXT: mov w0, #1 ; =0x1 +; CHECK-NEXT: str x8, [sp, #16] +; CHECK-NEXT: mov w8, #7 ; =0x7 ; CHECK-NEXT: mov w1, #2 ; =0x2 ; CHECK-NEXT: mov w2, #3 ; =0x3 ; CHECK-NEXT: mov w3, #4 ; =0x4 @@ -46,8 +47,7 @@ define i32 @caller() nounwind ssp { ; CHECK-NEXT: stp x22, x21, [sp, #160] ; 16-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #176] ; 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #192] ; 16-byte Folded Spill -; CHECK-NEXT: stp x8, x10, [sp, #8] -; CHECK-NEXT: str x9, [sp] +; CHECK-NEXT: stp x9, x8, [sp] ; CHECK-NEXT: bl _callee ; CHECK-NEXT: ldp x29, x30, [sp, #192] ; 16-byte Folded Reload ; CHECK-NEXT: ldp x20, x19, [sp, #176] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-implicit-def-subreg-to-reg.mir b/llvm/test/CodeGen/AArch64/register-coalesce-implicit-def-subreg-to-reg.mir new file mode 100644 index 0000000000000..a58a23068896b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/register-coalesce-implicit-def-subreg-to-reg.mir @@ -0,0 +1,45 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -start-before=register-coalescer -stop-after=virtregrewriter -enable-subreg-liveness=false -o - %s | FileCheck %s +# RUN: llc -mtriple=aarch64 -start-before=register-coalescer -stop-after=virtregrewriter -enable-subreg-liveness=true -o - %s | FileCheck %s +--- +name: test +tracksRegLiveness: true +body: | + bb.0: + liveins: $x1 + ; CHECK-LABEL: name: test + ; CHECK: liveins: $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x0 = COPY $x1 + ; CHECK-NEXT: renamable $w1 = ORRWrr $wzr, renamable $w0, implicit-def renamable $x1 + ; CHECK-NEXT: RET_ReallyLR implicit $x1, implicit $x0 + %190:gpr64 = COPY killed $x1 + %191:gpr32 = COPY %190.sub_32:gpr64 + %192:gpr32 = ORRWrr $wzr, killed %191:gpr32 + %193:gpr64all = SUBREG_TO_REG 0, killed %192:gpr32, %subreg.sub_32 + $x0 
= COPY killed %190:gpr64 + $x1 = COPY killed %193:gpr64all + RET_ReallyLR implicit $x1, implicit $x0 +... + +# In this test, we should avoid adding an implicit-def to ORRXri, because +# the register class will already be gpr64sp. +--- +name: test2 +tracksRegLiveness: true +frameInfo: + adjustsStack: true +body: | + bb.0: + ; CHECK-LABEL: name: test2 + ; CHECK: renamable $x8 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $x9 = ORRXri renamable $x8, 8128 + ; CHECK-NEXT: $x2 = ORRXri renamable $x8, 8128 + ; CHECK-NEXT: RET_ReallyLR implicit killed renamable $x8, implicit killed renamable $x9 + %0:gpr64 = IMPLICIT_DEF + %1:gpr64sp = ORRXri %0, 8128 + %3:gpr64 = SUBREG_TO_REG 0, %1.sub_32, %subreg.sub_32 + %2:gpr64all = COPY killed %1 + $x2 = COPY killed %2 + RET_ReallyLR implicit %0, implicit %3 +... diff --git a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir index 08fc47d9480ce..eb6242ce9940d 100644 --- a/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir +++ b/llvm/test/CodeGen/AArch64/register-coalesce-update-subranges-remat.mir @@ -7,9 +7,18 @@ # CHECK-DBG: ********** JOINING INTERVALS *********** # CHECK-DBG: ********** INTERVALS ********** # CHECK-DBG: %0 [16r,32r:0) 0@16r weight:0.000000e+00 -# CHECK-DBG: %3 [48r,112r:0) 0@48r L0000000000000040 [48r,112r:0) 0@48r weight:0.000000e+00 -# CHECK-DBG: %4 [80r,112e:1)[112e,112d:0) 0@112e 1@80r L0000000000000080 [112e,112d:0) 0@112e L0000000000000040 [80r,112e:1)[112e,112d:0) 0@112e 1@80r weight:0.000000e+00 +# CHECK-DBG: %3 [48r,112r:0) 0@48r L0000000000000080 [48r,112r:0) 0@48r L0000000000000040 [48r,112r:0) 0@48r weight:0.000000e+00 +# CHECK-DBG: %4 [80r,112e:1)[112e,112d:0) 0@112e 1@80r L0000000000000080 [80r,112e:1)[112e,112d:0) 0@112e 1@80r L0000000000000040 [80r,112e:1)[112e,112d:0) 0@112e 1@80r weight:0.000000e+00 # CHECK-DBG: %5 [32r,112r:1)[112r,112d:0) 0@112r 1@32r weight:0.000000e+00 +# CHECK-DBG: 
********** MACHINEINSTRS ********** +# CHECK-DBG: 0B bb.0.entry: +# CHECK-DBG: 16B %0:gpr64sp = ADDXri %stack.0, 0, 0 +# CHECK-DBG: 32B %5:gpr64common = nuw ADDXri %0:gpr64sp, 64, 0 +# CHECK-DBG: 48B undef %3.sub_32:gpr64 = MOVi32imm 64, implicit-def %3:gpr64 +# CHECK-DBG: 80B undef %4.sub_32:gpr64 = MOVi32imm 64, implicit-def %4:gpr64 +# CHECK-DBG: 112B dead %5:gpr64common, dead early-clobber %4:gpr64 = MOPSMemorySetPseudo %5:gpr64common(tied-def 0), %4:gpr64(tied-def 1), %3:gpr64, implicit-def dead $nzcv +# CHECK-DBG: 128B RET_ReallyLR + --- name: test tracksRegLiveness: true @@ -43,9 +52,44 @@ body: | # CHECK-DBG: %1 [32r,48B:2)[48B,320r:0)[320r,368B:1) 0@48B-phi 1@320r 2@32r # CHECK-DBG-SAME: weight:0.000000e+00 # CHECK-DBG: %3 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi -# CHECK-DBG-SAME: L0000000000000080 [288r,304B:0)[304B,320r:3) 0@288r 1@x 2@x 3@304B-phi +# CHECK-DBG-SAME: L0000000000000080 [240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@x 3@304B-phi # CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[240r,272B:1)[288r,304B:0)[304B,320r:3) 0@288r 1@240r 2@80r 3@304B-phi # CHECK-DBG-SAME: weight:0.000000e+00 +# CHECK-DBG: ********** MACHINEINSTRS ********** +# CHECK-DBG: 0B bb.0: +# CHECK-DBG: successors: %bb.1(0x80000000); %bb.1(100.00%) +# CHECK-DBG: 32B %1:gpr64 = IMPLICIT_DEF +# CHECK-DBG: 48B bb.1: +# CHECK-DBG: ; predecessors: %bb.0, %bb.7 +# CHECK-DBG: successors: %bb.2(0x80000000); %bb.2(100.00%) +# CHECK-DBG: 64B bb.2: +# CHECK-DBG: ; predecessors: %bb.1 +# CHECK-DBG: successors: %bb.3(0x80000000); %bb.3(100.00%) +# CHECK-DBG: 80B undef %3.sub_32:gpr64 = MOVi32imm 1 +# CHECK-DBG: 96B bb.3: +# CHECK-DBG: ; predecessors: %bb.2 +# CHECK-DBG: successors: %bb.7(0x40000000), %bb.4(0x40000000); %bb.7(50.00%), %bb.4(50.00%) +# CHECK-DBG: 112B $nzcv = IMPLICIT_DEF +# CHECK-DBG: 144B Bcc 1, %bb.7, implicit killed $nzcv +# CHECK-DBG: 160B bb.4: +# CHECK-DBG: ; predecessors: %bb.3 +# CHECK-DBG: successors: 
%bb.6(0x40000000), %bb.5(0x40000000); %bb.6(50.00%), %bb.5(50.00%) +# CHECK-DBG: 176B $nzcv = IMPLICIT_DEF +# CHECK-DBG: 192B Bcc 1, %bb.6, implicit killed $nzcv +# CHECK-DBG: 208B bb.5: +# CHECK-DBG: ; predecessors: %bb.4 +# CHECK-DBG: successors: %bb.7(0x80000000); %bb.7(100.00%) +# CHECK-DBG: 240B undef %3.sub_32:gpr64 = MOVi32imm 1, implicit-def %3:gpr64 +# CHECK-DBG: 256B B %bb.7 +# CHECK-DBG: 272B bb.6: +# CHECK-DBG: ; predecessors: %bb.4 +# CHECK-DBG: successors: %bb.7(0x80000000); %bb.7(100.00%) +# CHECK-DBG: 288B %3:gpr64 = COPY $xzr +# CHECK-DBG: 304B bb.7: +# CHECK-DBG: ; predecessors: %bb.3, %bb.5, %bb.6 +# CHECK-DBG: successors: %bb.1(0x80000000); %bb.1(100.00%) +# CHECK-DBG: 320B %1:gpr64 = ADDXrs %1:gpr64, %3:gpr64, 1 +# CHECK-DBG: 352B B %bb.1 --- name: reproducer tracksRegLiveness: true @@ -92,6 +136,42 @@ body: | # CHECK-DBG-SAME: L0000000000000080 [224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@x 3@288B-phi # CHECK-DBG-SAME: L0000000000000040 [80r,160B:2)[224r,256B:1)[272r,288B:0)[288B,304r:3) 0@272r 1@224r 2@80r 3@288B-phi # CHECK-DBG-SAME: weight:0.000000e+00 +# CHECK-DBG: ********** MACHINEINSTRS ********** +# CHECK-DBG: 0B bb.0: +# CHECK-DBG: successors: %bb.1(0x80000000); %bb.1(100.00%) +# CHECK-DBG: 32B %1:gpr64 = IMPLICIT_DEF +# CHECK-DBG: 48B bb.1: +# CHECK-DBG: ; predecessors: %bb.0, %bb.7 +# CHECK-DBG: successors: %bb.2(0x80000000); %bb.2(100.00%) +# CHECK-DBG: 64B bb.2: +# CHECK-DBG: ; predecessors: %bb.1 +# CHECK-DBG: successors: %bb.3(0x80000000); %bb.3(100.00%) +# CHECK-DBG: 80B undef %3.sub_32:gpr64 = MOVi32imm 1 +# CHECK-DBG: 96B bb.3: +# CHECK-DBG: ; predecessors: %bb.2 +# CHECK-DBG: successors: %bb.7(0x40000000), %bb.4(0x40000000); %bb.7(50.00%), %bb.4(50.00%) +# CHECK-DBG: 112B $nzcv = IMPLICIT_DEF +# CHECK-DBG: 144B Bcc 1, %bb.7, implicit killed $nzcv +# CHECK-DBG: 160B bb.4: +# CHECK-DBG: ; predecessors: %bb.3 +# CHECK-DBG: successors: %bb.6(0x40000000), %bb.5(0x40000000); %bb.6(50.00%), %bb.5(50.00%) +# CHECK-DBG: 
176B $nzcv = IMPLICIT_DEF +# CHECK-DBG: 192B Bcc 1, %bb.6, implicit killed $nzcv +# CHECK-DBG: 208B bb.5: +# CHECK-DBG: ; predecessors: %bb.4 +# CHECK-DBG: successors: %bb.7(0x80000000); %bb.7(100.00%) +# CHECK-DBG: 224B %3:gpr64 = IMPLICIT_DEF +# CHECK-DBG: 240B B %bb.7 +# CHECK-DBG: 256B bb.6: +# CHECK-DBG: ; predecessors: %bb.4 +# CHECK-DBG: successors: %bb.7(0x80000000); %bb.7(100.00%) +# CHECK-DBG: 272B %3:gpr64 = COPY $xzr +# CHECK-DBG: 288B bb.7: +# CHECK-DBG: ; predecessors: %bb.3, %bb.5, %bb.6 +# CHECK-DBG: successors: %bb.1(0x80000000); %bb.1(100.00%) +# CHECK-DBG: 304B %1:gpr64 = ADDXrs %1:gpr64, %3:gpr64, 1 +# CHECK-DBG: 336B B %bb.1 + --- name: reproducer2 tracksRegLiveness: true @@ -127,3 +207,78 @@ body: | B %bb.1 ... +# CHECK-DBG: ********** REGISTER COALESCER ********** +# CHECK-DBG: ********** Function: reproducer3 +# CHECK-DBG: ********** JOINING INTERVALS *********** +# CHECK-DBG: ********** INTERVALS ********** +# CHECK-DBG: W0 [0B,32r:0)[320r,336r:1) 0@0B-phi 1@320r +# CHECK-DBG: W1 [0B,16r:0) 0@0B-phi +# CHECK-DBG: %0 [16r,64r:0) 0@16r weight:0.000000e+00 +# CHECK-DBG: %1 [32r,128r:0) 0@32r weight:0.000000e+00 +# CHECK-DBG: %2 [48r,64r:0) 0@48r weight:0.000000e+00 +# CHECK-DBG: %3 [64r,80r:0) 0@64r weight:0.000000e+00 +# CHECK-DBG: %4 [80r,176r:0) 0@80r weight:0.000000e+00 +# CHECK-DBG: %7 [112r,128r:1)[128r,256r:0)[304B,320r:0) 0@128r 1@112r +# CHECK-DBG-SAME: L0000000000000080 [128r,256r:0)[304B,320r:0) 0@128r +# CHECK-DBG-SAME: L0000000000000040 [112r,128r:1)[128r,256r:0)[304B,320r:0) 0@128r 1@112r +# CHECK-DBG-SAME: weight:0.000000e+00 +# CHECK-DBG: %8 [96r,176r:1)[176r,192r:0) 0@176r 1@96r weight:0.000000e+00 +# CHECK-DBG: %9 [256r,272r:0) 0@256r weight:0.000000e+00 +# CHECK-DBG: ********** MACHINEINSTRS ********** +# CHECK-DBG: 0B bb.0: +# CHECK-DBG: successors: %bb.2(0x40000000), %bb.1(0x40000000); %bb.2(50.00%), %bb.1(50.00%) +# CHECK-DBG: liveins: $w0, $w1 +# CHECK-DBG: 16B %0:gpr32 = COPY $w1 +# CHECK-DBG: 32B %1:gpr32 = COPY $w0 +# 
CHECK-DBG: 48B %2:gpr32 = UBFMWri %1:gpr32, 31, 30 +# CHECK-DBG: 64B %3:gpr32 = SUBWrs %2:gpr32, %0:gpr32, 1 +# CHECK-DBG: 80B %4:gpr32 = UBFMWri %3:gpr32, 1, 31 +# CHECK-DBG: 96B %8:gpr32common = MOVi32imm 1 +# CHECK-DBG: 112B undef %7.sub_32:gpr64 = MOVi32imm 1 +# CHECK-DBG: 128B undef %7.sub_32:gpr64 = BFMWri %7.sub_32:gpr64(tied-def 0), %1:gpr32, 31, 30, implicit-def %7:gpr64 +# CHECK-DBG: 176B %8:gpr32common = BFMWri %8:gpr32common(tied-def 0), %4:gpr32, 30, 29 +# CHECK-DBG: 192B dead $wzr = SUBSWri %8:gpr32common, 0, 0, implicit-def $nzcv +# CHECK-DBG: 208B Bcc 2, %bb.2, implicit killed $nzcv +# CHECK-DBG: 224B B %bb.1 +# CHECK-DBG: 240B bb.1: +# CHECK-DBG: ; predecessors: %bb.0 +# CHECK-DBG: 256B %9:gpr64common = UBFMXri %7:gpr64, 62, 61 +# CHECK-DBG: 272B dead $xzr = LDRXui %9:gpr64common, 0 +# CHECK-DBG: 288B RET_ReallyLR +# CHECK-DBG: 304B bb.2: +# CHECK-DBG: ; predecessors: %bb.0 +# CHECK-DBG: 320B $x0 = COPY %7:gpr64 +# CHECK-DBG: 336B RET_ReallyLR implicit $x0 + +--- +name: reproducer3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $w0, $w1 + + %0:gpr32 = COPY killed $w1 + %1:gpr32 = COPY killed $w0 + %3:gpr32 = UBFMWri %1, 31, 30 + %4:gpr32 = SUBWrs killed %3, killed %0, 1 + %5:gpr32 = UBFMWri killed %4, 1, 31 + %6:gpr32 = MOVi32imm 1 + %7:gpr32 = COPY %6 + %7:gpr32 = BFMWri %7, killed %1, 31, 30 + %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32 + %9:gpr32common = COPY killed %6 + %9:gpr32common = BFMWri %9, killed %5, 30, 29 + dead $wzr = SUBSWri killed %9, 0, 0, implicit-def $nzcv + Bcc 2, %bb.2, implicit killed $nzcv + B %bb.1 + + bb.1: + %10:gpr64common = UBFMXri killed %8, 62, 61 + dead $xzr = LDRXui killed %10, 0 + RET_ReallyLR + + bb.2: + $x0 = COPY killed %8 + RET_ReallyLR implicit killed $x0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll new file mode 100644 index 0000000000000..96cf528056cb1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll @@ -0,0 +1,340 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s + +define amdgpu_ps void @v_fabs_f16(half %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GCN-NEXT: global_store_b16 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call half @llvm.fabs.f16(half %in) + store half %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f16(half inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call half @llvm.fabs.f16(half %in) + store half %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f16_salu_use(half inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f16_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f16_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call half @llvm.fabs.f16(half %in) + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, half %fabs, half 0.0 + store half %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_f32(float %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f32(float inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset0_b32 s0, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f32_salu_use(float inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f32_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; 
GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f32_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset0_b32 s0, 31 +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, float %fabs, float 0.0 + store float %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_f64(double %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f64(double inreg %in, ptr addrspace(1) %out) { +; GCN-LABEL: s_fabs_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off +; GCN-NEXT: s_endpgm + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_f64_salu_use(double inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_f64_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_f64_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fabs = call double @llvm.fabs.f64(double %in) + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, double %fabs, double 0.0 + store double %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_v2f16(<2 x half> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + store <2 x half> %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s1, s0, 16 +; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-NEXT: s_and_b32 s1, s1, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + store <2 x half> 
%fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f16_salu_use(<2 x half> inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_v2f16_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fff7fff, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_v2f16_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX12-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, <2 x half> %fabs, <2 x half> + store <2 x half> %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fabs_v2f32(<2 x float> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0 +; GFX11-NEXT: v_and_b32_e64 v3, 0x7fffffff, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset0_b32 s0, 31 +; GFX12-NEXT: s_bitset0_b32 s1, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_v2f32_salu_use(<2 x float> inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_v2f32_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e64 v2, 0x7fffffff, s0 +; GFX11-NEXT: v_and_b32_e64 v3, 0x7fffffff, s1 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_v2f32_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset0_b32 s0, 31 +; GFX12-NEXT: s_bitset0_b32 s1, 31 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, <2 x float> %fabs, <2 x float> + store <2 x float> %sel, ptr addrspace(1) 
%out + ret void +} + +define amdgpu_ps void @v_fabs_fneg_f32(float %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fabs_fneg_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + %fneg = fneg float %fabs + store float %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fabs_fneg_f32(float inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fabs_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_or_b32_e64 v2, 0x80000000, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fabs_fneg_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_bitset1_b32 s0, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fabs = call float @llvm.fabs.f32(float %in) + %fneg = fneg float %fabs + store float %fneg, ptr addrspace(1) %out + ret void +} + +declare half @llvm.fabs.f16(half) +declare float @llvm.fabs.f32(float) +declare double @llvm.fabs.f64(double) +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll new file mode 100644 index 0000000000000..8a260926d0a4f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12 %s + +define amdgpu_ps void @v_fneg_f16(half %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f16: +; GCN: ; %bb.0: +; GCN-NEXT: 
v_xor_b32_e32 v0, 0x8000, v0 +; GCN-NEXT: global_store_b16 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg half %in + store half %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f16(half inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x8000, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg half %in + store half %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f16_salu_use(half inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f16_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x8000, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f16_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b16 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg half %in + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, half %fneg, half 0.0 + store half %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_f32(float %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg float %in + 
store float %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f32(float inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg float %in + store float %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f32_salu_use(float inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f32_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f32_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg float %in + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, float %fneg, float 0.0 + store float %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_f64(double %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fneg = fneg double %in + store double %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void 
@s_fneg_f64(double inreg %in, ptr addrspace(1) %out) { +; GCN-LABEL: s_fneg_f64: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off +; GCN-NEXT: s_endpgm + %fneg = fneg double %in + store double %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_f64_salu_use(double inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_f64_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_f64_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX12-NEXT: v_readfirstlane_b32 s0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fneg = fneg double %in + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, double %fneg, double 0.0 + store double %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_v2f16(<2 x 
half> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GCN-NEXT: global_store_b32 v[1:2], v0, off +; GCN-NEXT: s_endpgm + %fneg = fneg <2 x half> %in + store <2 x half> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f16(<2 x half> inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80008000, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s1, s0, 16 +; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-NEXT: s_xor_b32 s1, s1, 0x8000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg <2 x half> %in + store <2 x half> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f16_salu_use(<2 x half> inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_v2f16_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80008000, s0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: s_cselect_b32 s0, s0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_v2f16_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX12-NEXT: s_xor_b32 s2, s2, 0x8000 +; GFX12-NEXT: s_cmp_eq_u32 s1, 0 +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cselect_b32 s0, s0, 0 +; GFX12-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX12-NEXT: global_store_b32 v[0:1], v2, off +; GFX12-NEXT: s_endpgm + %fneg = fneg <2 x half> %in + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, <2 x half> %fneg, <2 x half> + store <2 x half> %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @v_fneg_v2f32(<2 x float> %in, ptr addrspace(1) %out) { +; GCN-LABEL: v_fneg_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off +; GCN-NEXT: s_endpgm + %fneg = fneg <2 x float> %in + store <2 x float> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f32(<2 x float> inreg %in, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0 +; GFX11-NEXT: v_xor_b32_e64 v3, 0x80000000, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fneg = fneg <2 x float> %in + store <2 x float> %fneg, ptr addrspace(1) %out + ret void +} +define amdgpu_ps void @s_fneg_v2f32_salu_use(<2 x float> inreg %in, i32 inreg %val, ptr addrspace(1) %out) { +; GFX11-LABEL: s_fneg_v2f32_salu_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e64 v2, 0x80000000, s0 +; GFX11-NEXT: v_xor_b32_e64 v3, 0x80000000, s1 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: s_fneg_v2f32_salu_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_xor_b32 s0, s0, 0x80000000 +; GFX12-NEXT: s_xor_b32 s1, s1, 0x80000000 +; GFX12-NEXT: s_cmp_eq_u32 s2, 0 +; GFX12-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %fneg = fneg <2 x float> %in + %cond = icmp eq i32 %val, 0 + %sel = select i1 %cond, <2 x float> %fneg, <2 x float> + store <2 x float> %sel, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/BPF/atomic-oversize.ll b/llvm/test/CodeGen/BPF/atomic-oversize.ll index 187f0964d4fb8..6dc49398f091d 100644 --- a/llvm/test/CodeGen/BPF/atomic-oversize.ll +++ b/llvm/test/CodeGen/BPF/atomic-oversize.ll @@ -1,6 +1,4 @@ ; RUN: llc -mtriple=bpf < %s | FileCheck %s -; XFAIL: * -; Doesn't currently build, with error 'only small returns supported'. 
define void @test(ptr %a) nounwind { ; CHECK-LABEL: test: diff --git a/llvm/test/CodeGen/BPF/builtin_calls.ll b/llvm/test/CodeGen/BPF/builtin_calls.ll new file mode 100644 index 0000000000000..18199eba7222a --- /dev/null +++ b/llvm/test/CodeGen/BPF/builtin_calls.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=bpfel -mattr=+allow-builtin-calls < %s | FileCheck %s +; +; C code for this test case: +; +; long func(long a, long b) { +; long x; +; return __builtin_mul_overflow(a, b, &x); +; } + + +declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) + +define noundef range(i64 0, 2) i64 @func(i64 noundef %a, i64 noundef %b) local_unnamed_addr { +entry: + %0 = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %a, i64 %b) + %1 = extractvalue { i64, i1 } %0, 1 + %conv = zext i1 %1 to i64 + ret i64 %conv +} + +; CHECK-LABEL: func +; CHECK: r4 = r2 +; CHECK: r2 = r1 +; CHECK: r3 = r2 +; CHECK: r3 s>>= 63 +; CHECK: r5 = r4 +; CHECK: r5 s>>= 63 +; CHECK: r1 = r10 +; CHECK: r1 += -16 +; CHECK: call __multi3 +; CHECK: r1 = *(u64 *)(r10 - 16) +; CHECK: r1 s>>= 63 +; CHECK: w0 = 1 +; CHECK: r2 = *(u64 *)(r10 - 8) +; CHECK: if r2 != r1 goto LBB0_2 +; CHECK: # %bb.1: # %entry +; CHECK: w0 = 0 +; CHECK: LBB0_2: # %entry +; CHECK: exit \ No newline at end of file diff --git a/llvm/test/CodeGen/BPF/struct_ret1.ll b/llvm/test/CodeGen/BPF/struct_ret1.ll index 40d17ec514c48..eb66a7deacb91 100644 --- a/llvm/test/CodeGen/BPF/struct_ret1.ll +++ b/llvm/test/CodeGen/BPF/struct_ret1.ll @@ -1,6 +1,6 @@ ; RUN: not llc -mtriple=bpf < %s 2> %t1 ; RUN: FileCheck %s < %t1 -; CHECK: error: :0:0: in function bar { i64, i32 } (i32, i32, i32, i32, i32): aggregate returns are not supported +; CHECK: error: :0:0: in function bar { i64, i32 } (i32, i32, i32, i32, i32): stack arguments are not supported %struct.S = type { i32, i32, i32 } diff --git a/llvm/test/CodeGen/BPF/struct_ret2.ll b/llvm/test/CodeGen/BPF/struct_ret2.ll index 170d55cc29df0..a20280949215e 100644 --- 
a/llvm/test/CodeGen/BPF/struct_ret2.ll +++ b/llvm/test/CodeGen/BPF/struct_ret2.ll @@ -1,6 +1,6 @@ ; RUN: not llc -mtriple=bpf < %s 2> %t1 ; RUN: FileCheck %s < %t1 -; CHECK: only small returns +; CHECK: too many arguments ; Function Attrs: nounwind uwtable define { i64, i32 } @foo(i32 %a, i32 %b, i32 %c) #0 { diff --git a/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll b/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll index 156a8e7c5c386..def886f933d08 100644 --- a/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll +++ b/llvm/test/CodeGen/DirectX/bugfix_150050_data_scalarize_const_gep.ll @@ -11,9 +11,10 @@ define void @CSMain() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE:%.*]] = alloca [4 x i32], align 16 ; -; SCHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [10 x <4 x i32>], ptr addrspace(3) getelementptr inbounds ([10 x [10 x [4 x i32]]], ptr addrspace(3) @aTile.scalarized, i32 0, i32 1), i32 0, i32 2 -; SCHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(3) [[TMP0]], align 16 -; SCHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[AFRAGPACKED_I_SCALARIZE]], align 16 +; SCHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds [10 x [10 x [4 x i32]]], ptr addrspace(3) @aTile.scalarized, i32 0, i32 1 +; SCHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [10 x [4 x i32]], ptr addrspace(3) [[GEP0]], i32 0, i32 2 +; SCHECK-NEXT: [[LOAD:%.*]] = load <4 x i32>, ptr addrspace(3) [[GEP1]], align 16 +; SCHECK-NEXT: store <4 x i32> [[LOAD]], ptr [[AFRAGPACKED_I_SCALARIZE]], align 16 ; ; FCHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE_I14:%.*]] = getelementptr [4 x i32], ptr [[AFRAGPACKED_I_SCALARIZE]], i32 0, i32 1 ; FCHECK-NEXT: [[AFRAGPACKED_I_SCALARIZE_I25:%.*]] = getelementptr [4 x i32], ptr [[AFRAGPACKED_I_SCALARIZE]], i32 0, i32 2 @@ -40,12 +41,13 @@ define void @Main() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[BFRAGPACKED_I:%.*]] = alloca i32, align 16 ; -; SCHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds [10 x i32], ptr addrspace(3) getelementptr inbounds ([10 x [10 x i32]], ptr addrspace(3) @bTile, i32 0, i32 1), i32 0, i32 1 -; SCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[TMP0]], align 16 -; SCHECK-NEXT: store i32 [[TMP1]], ptr [[BFRAGPACKED_I]], align 16 +; SCHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr addrspace(3) @bTile, i32 0, i32 1 +; SCHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [10 x i32], ptr addrspace(3) [[GEP0]], i32 0, i32 1 +; SCHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(3) [[GEP1]], align 16 +; SCHECK-NEXT: store i32 [[LOAD]], ptr [[BFRAGPACKED_I]], align 16 ; -; FCHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([100 x i32], ptr addrspace(3) @bTile.1dim, i32 0, i32 11), align 16 -; FCHECK-NEXT: store i32 [[TMP0]], ptr [[BFRAGPACKED_I]], align 16 +; FCHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([100 x i32], ptr addrspace(3) @bTile.1dim, i32 0, i32 11), align 16 +; FCHECK-NEXT: store i32 [[LOAD]], ptr [[BFRAGPACKED_I]], align 16 ; ; CHECK-NEXT: ret void entry: @@ -57,10 +59,12 @@ entry: define void @global_nested_geps_3d() { ; CHECK-LABEL: define void @global_nested_geps_3d() { -; SCHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr getelementptr inbounds ([2 x <2 x i32>], ptr getelementptr inbounds ([2 x [2 x [2 x i32]]], ptr @cTile.scalarized, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1 -; SCHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; SCHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds [2 x [2 x [2 x i32]]], ptr @cTile.scalarized, i32 0, i32 1 +; SCHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[GEP0]], i32 0, i32 1 +; SCHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[GEP1]], i32 0, i32 1 +; SCHECK-NEXT: load i32, ptr [[GEP2]], align 4 ; -; FCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @cTile.scalarized.1dim, i32 
0, i32 7), align 4 +; FCHECK-NEXT: load i32, ptr getelementptr inbounds ([8 x i32], ptr @cTile.scalarized.1dim, i32 0, i32 7), align 4 ; ; CHECK-NEXT: ret void %1 = load i32, i32* getelementptr inbounds (<2 x i32>, <2 x i32>* getelementptr inbounds ([2 x <2 x i32>], [2 x <2 x i32>]* getelementptr inbounds ([2 x [2 x <2 x i32>]], [2 x [2 x <2 x i32>]]* @cTile, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), align 4 @@ -69,10 +73,13 @@ define void @global_nested_geps_3d() { define void @global_nested_geps_4d() { ; CHECK-LABEL: define void @global_nested_geps_4d() { -; SCHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i32>, ptr getelementptr inbounds ([2 x <2 x i32>], ptr getelementptr inbounds ([2 x [2 x <2 x i32>]], ptr getelementptr inbounds ([2 x [2 x [2 x [2 x i32]]]], ptr @dTile.scalarized, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), i32 0, i32 1 -; SCHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; SCHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], ptr @dTile.scalarized, i32 0, i32 1 +; SCHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [2 x [2 x [2 x i32]]], ptr [[GEP0]], i32 0, i32 1 +; SCHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [2 x [2 x i32]], ptr [[GEP1]], i32 0, i32 1 +; SCHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[GEP2]], i32 0, i32 1 +; SCHECK-NEXT: load i32, ptr [[GEP3]], align 4 ; -; FCHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @dTile.scalarized.1dim, i32 0, i32 15), align 4 +; FCHECK-NEXT: load i32, ptr getelementptr inbounds ([16 x i32], ptr @dTile.scalarized.1dim, i32 0, i32 15), align 4 ; ; CHECK-NEXT: ret void %1 = load i32, i32* getelementptr inbounds (<2 x i32>, <2 x i32>* getelementptr inbounds ([2 x <2 x i32>], [2 x <2 x i32>]* getelementptr inbounds ([2 x [2 x <2 x i32>]], [2 x [2 x <2 x i32>]]* getelementptr inbounds ([2 x [2 x [2 x <2 x i32>]]], [2 x [2 x [2 x <2 x i32>]]]* @dTile, i32 0, i32 1), i32 0, i32 1), i32 0, i32 1), 
i32 0, i32 1), align 4 diff --git a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll index 475935d2eb135..85e3bb0185e44 100644 --- a/llvm/test/CodeGen/DirectX/scalarize-alloca.ll +++ b/llvm/test/CodeGen/DirectX/scalarize-alloca.ll @@ -48,7 +48,7 @@ define void @subtype_array_test() { ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[alloca_val]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] @@ -64,7 +64,7 @@ define void @subtype_vector_test() { ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[alloca_val]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] @@ -80,7 +80,7 @@ define void @subtype_scalar_test() { ; SCHECK: [[alloca_val:%.*]] = alloca [8 x [4 x i32]], align 4 ; FCHECK: [[alloca_val:%.*]] = alloca [32 x i32], align 4 ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr [[alloca_val]], i32 0, i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i32, 
ptr [[alloca_val]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr [[alloca_val]], i32 0, i32 [[flatidx]] diff --git a/llvm/test/CodeGen/DirectX/scalarize-global.ll b/llvm/test/CodeGen/DirectX/scalarize-global.ll index ca10f6ece5a85..c27dc4083bfd3 100644 --- a/llvm/test/CodeGen/DirectX/scalarize-global.ll +++ b/llvm/test/CodeGen/DirectX/scalarize-global.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: subtype_array_test define <4 x i32> @subtype_array_test() { ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) [[arrayofVecData]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] @@ -26,7 +26,7 @@ define <4 x i32> @subtype_array_test() { ; CHECK-LABEL: subtype_vector_test define <4 x i32> @subtype_vector_test() { ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) [[arrayofVecData]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 4 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] @@ -41,7 +41,7 @@ define <4 x i32> @subtype_vector_test() { ; CHECK-LABEL: subtype_scalar_test define <4 x i32> @subtype_scalar_test() { ; CHECK: [[tid:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) - ; SCHECK: 
[[gep:%.*]] = getelementptr inbounds nuw [8 x [4 x i32]], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 0, i32 [[tid]] + ; SCHECK: [[gep:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(3) [[arrayofVecData]], i32 [[tid]] ; FCHECK: [[flatidx_mul:%.*]] = mul i32 [[tid]], 1 ; FCHECK: [[flatidx:%.*]] = add i32 0, [[flatidx_mul]] ; FCHECK: [[gep:%.*]] = getelementptr inbounds nuw [32 x i32], ptr addrspace(3) [[arrayofVecData]], i32 0, i32 [[flatidx]] diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index d09ef0e2c6ac0..c12dbe488263f 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -1532,11 +1532,11 @@ define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float ; CHECK-NEXT: # kill: def $f7 killed $f7 def $vr7 ; CHECK-NEXT: # kill: def $f6 killed $f6 def $vr6 ; CHECK-NEXT: # kill: def $f5 killed $f5 def $vr5 -; CHECK-NEXT: # kill: def $f4 killed $f4 def $xr4 +; CHECK-NEXT: # kill: def $f4 killed $f4 def $vr4 def $xr4 ; CHECK-NEXT: # kill: def $f3 killed $f3 def $vr3 ; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2 ; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; CHECK-NEXT: vextrins.w $vr4, $vr5, 16 ; CHECK-NEXT: vextrins.w $vr4, $vr6, 32 ; CHECK-NEXT: vextrins.w $vr4, $vr7, 48 @@ -1619,7 +1619,7 @@ define void @buildvector_v8f32_subseq_2(ptr %dst, float %a0, float %a1, float %a ; CHECK-NEXT: # kill: def $f3 killed $f3 def $vr3 ; CHECK-NEXT: # kill: def $f2 killed $f2 def $vr2 ; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 ; CHECK-NEXT: vextrins.w $vr0, $vr2, 32 ; CHECK-NEXT: vextrins.w $vr0, $vr3, 48 @@ -1643,7 +1643,7 @@ define void 
@buildvector_v8f32_subseq_4(ptr %dst, float %a0, float %a1) nounwind ; CHECK-LABEL: buildvector_v8f32_subseq_4: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 ; CHECK-NEXT: xvreplve0.d $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 @@ -1665,9 +1665,9 @@ define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, dou ; CHECK-LABEL: buildvector_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $f3_64 killed $f3_64 def $vr3 -; CHECK-NEXT: # kill: def $f2_64 killed $f2_64 def $xr2 +; CHECK-NEXT: # kill: def $f2_64 killed $f2_64 def $vr2 def $xr2 ; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; CHECK-NEXT: vextrins.d $vr2, $vr3, 16 ; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 @@ -1722,7 +1722,7 @@ define void @buildvector_v4f64_subseq_2(ptr %dst, double %a0, double %a1) nounwi ; CHECK-LABEL: buildvector_v4f64_subseq_2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 ; CHECK-NEXT: xvreplve0.q $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 45b25013c9173..d01848e8547b6 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -22,7 +22,7 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 ; LA32-NEXT: move $a0, $fp ; LA32-NEXT: bl __powisf2 -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; 
LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload ; LA32-NEXT: vextrins.w $vr0, $vr1, 16 ; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill @@ -56,7 +56,7 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 ; LA32-NEXT: move $a0, $fp ; LA32-NEXT: bl __powisf2 -; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload ; LA32-NEXT: vextrins.w $vr0, $vr1, 16 ; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill @@ -105,7 +105,7 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; LA64-NEXT: move $a0, $fp ; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vextrins.w $vr0, $vr1, 16 ; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill @@ -143,7 +143,7 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; LA64-NEXT: move $a0, $fp ; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vextrins.w $vr0, $vr1, 16 ; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill @@ -198,7 +198,7 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 ; LA32-NEXT: move $a0, $fp ; LA32-NEXT: bl __powidf2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA32-NEXT: vextrins.d $vr0, $vr1, 16 ; LA32-NEXT: xvst $xr0, $sp, 32 # 
32-byte Folded Spill @@ -214,7 +214,7 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 ; LA32-NEXT: move $a0, $fp ; LA32-NEXT: bl __powidf2 -; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload ; LA32-NEXT: vextrins.d $vr0, $vr1, 16 ; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload @@ -244,7 +244,7 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; LA64-NEXT: move $a0, $fp ; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: vextrins.d $vr0, $vr1, 16 ; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill @@ -262,7 +262,7 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; LA64-NEXT: move $a0, $fp ; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vextrins.d $vr0, $vr1, 16 ; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalar-to-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/scalar-to-vector.ll index bba269279937a..be5d42bdfb975 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/scalar-to-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/scalar-to-vector.ll @@ -49,7 +49,7 @@ define <4 x i64> @scalar_to_4xi64(i64 %val) { define <8 x float> @scalar_to_8xf32(float %val) { ; CHECK-LABEL: scalar_to_8xf32: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 def $xr0 ; CHECK-NEXT: ret %ret = 
insertelement <8 x float> poison, float %val, i32 0 ret <8 x float> %ret @@ -58,7 +58,7 @@ define <8 x float> @scalar_to_8xf32(float %val) { define <4 x double> @scalar_to_4xf64(double %val) { ; CHECK-LABEL: scalar_to_4xf64: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 def $xr0 ; CHECK-NEXT: ret %ret = insertelement <4 x double> poison, double %val, i32 0 ret <4 x double> %ret diff --git a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll index afc7a39e18dc8..aae23265710ce 100644 --- a/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vec_insert_elt.ll @@ -750,21 +750,25 @@ entry: define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) { ; CHECK-64-LABEL: testDoubleImm1: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-64-NEXT: xxpermdi 34, 1, 34, 1 ; CHECK-64-NEXT: blr ; ; CHECK-32-LABEL: testDoubleImm1: ; CHECK-32: # %bb.0: # %entry +; CHECK-32-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-32-NEXT: xxpermdi 34, 1, 34, 1 ; CHECK-32-NEXT: blr ; ; CHECK-64-P10-LABEL: testDoubleImm1: ; CHECK-64-P10: # %bb.0: # %entry +; CHECK-64-P10-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-64-P10-NEXT: xxpermdi 34, 1, 34, 1 ; CHECK-64-P10-NEXT: blr ; ; CHECK-32-P10-LABEL: testDoubleImm1: ; CHECK-32-P10: # %bb.0: # %entry +; CHECK-32-P10-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-32-P10-NEXT: xxpermdi 34, 1, 34, 1 ; CHECK-32-P10-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll index fb55511162a7e..4f965780ab95e 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -1754,7 +1754,11 @@ entry: define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) { ; P9BE-LABEL: fromRegsConvdtoi: ; 
P9BE: # %bb.0: # %entry +; P9BE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 ; P9BE-NEXT: xxmrghd vs0, vs2, vs4 +; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9BE-NEXT: xvcvdpsxws v2, vs0 ; P9BE-NEXT: xxmrghd vs0, vs1, vs3 ; P9BE-NEXT: xvcvdpsxws v3, vs0 @@ -1763,7 +1767,11 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) { ; ; P9LE-LABEL: fromRegsConvdtoi: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9LE-NEXT: xxmrghd vs0, vs3, vs1 +; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 ; P9LE-NEXT: xvcvdpsxws v2, vs0 ; P9LE-NEXT: xxmrghd vs0, vs4, vs2 ; P9LE-NEXT: xvcvdpsxws v3, vs0 @@ -1772,6 +1780,10 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) { ; ; P8BE-LABEL: fromRegsConvdtoi: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8BE-NEXT: xxmrghd vs0, vs2, vs4 ; P8BE-NEXT: xxmrghd vs1, vs1, vs3 ; P8BE-NEXT: xvcvdpsxws v2, vs0 @@ -1781,6 +1793,10 @@ define <4 x i32> @fromRegsConvdtoi(double %a, double %b, double %c, double %d) { ; ; P8LE-LABEL: fromRegsConvdtoi: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8LE-NEXT: xxmrghd vs0, vs3, vs1 ; P8LE-NEXT: xxmrghd vs1, vs4, vs2 ; P8LE-NEXT: xvcvdpsxws v2, vs0 @@ -3240,7 +3256,11 @@ entry: define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) { ; P9BE-LABEL: fromRegsConvdtoui: ; P9BE: # %bb.0: # %entry +; P9BE-NEXT: # kill: 
def $f4 killed $f4 def $vsl4 +; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 ; P9BE-NEXT: xxmrghd vs0, vs2, vs4 +; P9BE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9BE-NEXT: xvcvdpuxws v2, vs0 ; P9BE-NEXT: xxmrghd vs0, vs1, vs3 ; P9BE-NEXT: xvcvdpuxws v3, vs0 @@ -3249,7 +3269,11 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) ; ; P9LE-LABEL: fromRegsConvdtoui: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9LE-NEXT: xxmrghd vs0, vs3, vs1 +; P9LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 ; P9LE-NEXT: xvcvdpuxws v2, vs0 ; P9LE-NEXT: xxmrghd vs0, vs4, vs2 ; P9LE-NEXT: xvcvdpuxws v3, vs0 @@ -3258,6 +3282,10 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) ; ; P8BE-LABEL: fromRegsConvdtoui: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P8BE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8BE-NEXT: xxmrghd vs0, vs2, vs4 ; P8BE-NEXT: xxmrghd vs1, vs1, vs3 ; P8BE-NEXT: xvcvdpuxws v2, vs0 @@ -3267,6 +3295,10 @@ define <4 x i32> @fromRegsConvdtoui(double %a, double %b, double %c, double %d) ; ; P8LE-LABEL: fromRegsConvdtoui: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; P8LE-NEXT: # kill: def $f3 killed $f3 def $vsl3 +; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8LE-NEXT: xxmrghd vs0, vs3, vs1 ; P8LE-NEXT: xxmrghd vs1, vs4, vs2 ; P8LE-NEXT: xvcvdpuxws v2, vs0 @@ -4540,24 +4572,32 @@ entry: define <2 x i64> @fromRegsConvdtoll(double %a, double %b) { ; P9BE-LABEL: fromRegsConvdtoll: ; P9BE: # %bb.0: # %entry +; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P9BE-NEXT: # kill: def $f1 killed $f1 
def $vsl1 ; P9BE-NEXT: xxmrghd vs0, vs1, vs2 ; P9BE-NEXT: xvcvdpsxds v2, vs0 ; P9BE-NEXT: blr ; ; P9LE-LABEL: fromRegsConvdtoll: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9LE-NEXT: xxmrghd vs0, vs2, vs1 ; P9LE-NEXT: xvcvdpsxds v2, vs0 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromRegsConvdtoll: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8BE-NEXT: xxmrghd vs0, vs1, vs2 ; P8BE-NEXT: xvcvdpsxds v2, vs0 ; P8BE-NEXT: blr ; ; P8LE-LABEL: fromRegsConvdtoll: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8LE-NEXT: xxmrghd vs0, vs2, vs1 ; P8LE-NEXT: xvcvdpsxds v2, vs0 ; P8LE-NEXT: blr @@ -5694,24 +5734,32 @@ entry: define <2 x i64> @fromRegsConvdtoull(double %a, double %b) { ; P9BE-LABEL: fromRegsConvdtoull: ; P9BE: # %bb.0: # %entry +; P9BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P9BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9BE-NEXT: xxmrghd vs0, vs1, vs2 ; P9BE-NEXT: xvcvdpuxds v2, vs0 ; P9BE-NEXT: blr ; ; P9LE-LABEL: fromRegsConvdtoull: ; P9LE: # %bb.0: # %entry +; P9LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P9LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9LE-NEXT: xxmrghd vs0, vs2, vs1 ; P9LE-NEXT: xvcvdpuxds v2, vs0 ; P9LE-NEXT: blr ; ; P8BE-LABEL: fromRegsConvdtoull: ; P8BE: # %bb.0: # %entry +; P8BE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8BE-NEXT: xxmrghd vs0, vs1, vs2 ; P8BE-NEXT: xvcvdpuxds v2, vs0 ; P8BE-NEXT: blr ; ; P8LE-LABEL: fromRegsConvdtoull: ; P8LE: # %bb.0: # %entry +; P8LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; P8LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8LE-NEXT: xxmrghd vs0, vs2, vs1 ; P8LE-NEXT: xvcvdpuxds v2, vs0 ; P8LE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll 
b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 7f6fdc7f88cd1..b40fbc3e16873 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -562,6 +562,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; CHECK-P8-NEXT: bl dummy ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: xxlxor f0, f0, f0 +; CHECK-P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0 ; CHECK-P8-NEXT: xxswapd vs0, vs0 ; CHECK-P8-NEXT: stxvd2x vs0, 0, r30 @@ -576,6 +577,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; CHECK-P9-NEXT: bl dummy ; CHECK-P9-NEXT: nop ; CHECK-P9-NEXT: xxlxor f0, f0, f0 +; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-P9-NEXT: xxmrghd vs0, vs1, vs0 ; CHECK-P9-NEXT: stxv vs0, 0(r30) ; @@ -589,6 +591,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; CHECK-P9-BE-NEXT: bl dummy ; CHECK-P9-BE-NEXT: nop ; CHECK-P9-BE-NEXT: xxlxor f0, f0, f0 +; CHECK-P9-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-P9-BE-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-P9-BE-NEXT: stxv vs0, 0(r30) ; @@ -615,6 +618,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; CHECK-P7-NEXT: bl dummy ; CHECK-P7-NEXT: nop ; CHECK-P7-NEXT: xxlxor f0, f0, f0 +; CHECK-P7-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-P7-NEXT: xxmrghd vs0, vs1, vs0 ; CHECK-P7-NEXT: xxswapd vs0, vs0 ; CHECK-P7-NEXT: stxvd2x vs0, 0, r30 @@ -629,6 +633,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; P8-AIX-64-NEXT: bl .dummy[PR] ; P8-AIX-64-NEXT: nop ; P8-AIX-64-NEXT: xxlxor f0, f0, f0 +; P8-AIX-64-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-AIX-64-NEXT: xxmrghd vs0, vs0, vs1 ; P8-AIX-64-NEXT: stxvd2x vs0, 0, r31 ; @@ -642,6 +647,7 @@ define dso_local void @no_crash_elt0_from_RHS(ptr noalias nocapture dereferencea ; 
P8-AIX-32-NEXT: bl .dummy[PR] ; P8-AIX-32-NEXT: nop ; P8-AIX-32-NEXT: xxlxor f0, f0, f0 +; P8-AIX-32-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-AIX-32-NEXT: xxmrghd vs0, vs0, vs1 ; P8-AIX-32-NEXT: stxvd2x vs0, 0, r31 test_entry: diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll index 04af0947c7a33..a72abf7007e8d 100644 --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -6,6 +6,7 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) { ; CHECK-LABEL: fneg_fdiv_splat: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxspltd 0, 1, 0 ; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l ; CHECK-NEXT: xvredp 1, 0 diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll index eac4fb6f98bf7..4519cf4101f42 100644 --- a/llvm/test/CodeGen/PowerPC/fp-strict-round.ll +++ b/llvm/test/CodeGen/PowerPC/fp-strict-round.ll @@ -229,6 +229,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp ; P8-NEXT: xscvspdpn f1, vs0 ; P8-NEXT: bl nearbyintf ; P8-NEXT: nop +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-NEXT: xxmrghd vs0, vs1, v30 ; P8-NEXT: xscvspdpn f1, v31 ; P8-NEXT: xvcvdpsp v29, vs0 @@ -239,6 +240,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp ; P8-NEXT: xscvspdpn f1, vs0 ; P8-NEXT: bl nearbyintf ; P8-NEXT: nop +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-NEXT: xxmrghd vs0, v30, vs1 ; P8-NEXT: li r3, 160 ; P8-NEXT: xvcvdpsp v2, vs0 @@ -276,6 +278,7 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp ; P9-NEXT: xscvspdpn f1, vs0 ; P9-NEXT: bl nearbyintf ; P9-NEXT: nop +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9-NEXT: xxmrghd vs0, vs1, v30 ; P9-NEXT: xscvspdpn f1, v31 ; P9-NEXT: xvcvdpsp v29, vs0 @@ -286,6 +289,7 @@ 
define <4 x float> @nearbyint_v4f32(<4 x float> %vf1, <4 x float> %vf2) strictfp ; P9-NEXT: xscvspdpn f1, vs0 ; P9-NEXT: bl nearbyintf ; P9-NEXT: nop +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9-NEXT: xxmrghd vs0, v30, vs1 ; P9-NEXT: lxv v31, 64(r1) # 16-byte Folded Reload ; P9-NEXT: lxv v30, 48(r1) # 16-byte Folded Reload @@ -326,6 +330,7 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric ; P8-NEXT: bl nearbyint ; P8-NEXT: nop ; P8-NEXT: li r3, 144 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-NEXT: xxmrghd v2, v30, vs1 ; P8-NEXT: lxvd2x v31, r1, r3 # 16-byte Folded Reload ; P8-NEXT: li r3, 128 @@ -354,6 +359,7 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %vf1, <2 x double> %vf2) stric ; P9-NEXT: xxswapd vs1, v31 ; P9-NEXT: bl nearbyint ; P9-NEXT: nop +; P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P9-NEXT: xxmrghd v2, v30, vs1 ; P9-NEXT: lxv v31, 48(r1) # 16-byte Folded Reload ; P9-NEXT: lxv v30, 32(r1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/frem.ll b/llvm/test/CodeGen/PowerPC/frem.ll index 19b4b1c9cdf95..21cb206ac43bb 100644 --- a/llvm/test/CodeGen/PowerPC/frem.ll +++ b/llvm/test/CodeGen/PowerPC/frem.ll @@ -70,6 +70,7 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) { ; CHECK-NEXT: xscvspdpn 2, 0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd 0, 1, 61 ; CHECK-NEXT: xscvspdpn 1, 62 ; CHECK-NEXT: xscvspdpn 2, 63 @@ -83,6 +84,7 @@ define <4 x float> @frem4x32(<4 x float> %a, <4 x float> %b) { ; CHECK-NEXT: xscvspdpn 2, 0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd 0, 61, 1 ; CHECK-NEXT: lxv 63, 80(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 62, 64(1) # 16-byte Folded Reload @@ -124,6 +126,7 @@ define <2 x double> @frem2x64(<2 x double> %a, <2 x double> %b) { ; CHECK-NEXT: xxswapd 2, 63 ; CHECK-NEXT: bl fmod ; CHECK-NEXT: nop +; 
CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd 34, 61, 1 ; CHECK-NEXT: lxv 63, 64(1) # 16-byte Folded Reload ; CHECK-NEXT: lxv 62, 48(1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll index 238e200bfc782..3ae0b02f79e27 100644 --- a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll +++ b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll @@ -41,39 +41,47 @@ define void @test(ptr %p1, ptr %p2) nounwind { ; CHECK-NEXT: xxswapd 61, 63 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 56, 1 ; CHECK-NEXT: xxlor 1, 59, 59 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 0, 1 ; CHECK-NEXT: xxlor 1, 60, 60 ; CHECK-NEXT: xxmrgld 59, 0, 56 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 60, 1 ; CHECK-NEXT: xxlor 1, 62, 62 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 0, 1 ; CHECK-NEXT: xxlor 1, 61, 61 ; CHECK-NEXT: xxmrgld 62, 0, 60 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 61, 1 ; CHECK-NEXT: xxlor 1, 63, 63 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 0, 1 ; CHECK-NEXT: xxlor 1, 57, 57 ; CHECK-NEXT: xxmrgld 63, 0, 61 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 61, 1 ; CHECK-NEXT: xxlor 1, 58, 58 ; CHECK-NEXT: bl roundeven ; CHECK-NEXT: nop ; CHECK-NEXT: li 3, 160 ; CHECK-NEXT: stxvd2x 63, 30, 29 +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxswapd 0, 1 ; CHECK-NEXT: stxvd2x 62, 30, 28 ; CHECK-NEXT: stxvd2x 59, 30, 27 diff --git 
a/llvm/test/CodeGen/PowerPC/half.ll b/llvm/test/CodeGen/PowerPC/half.ll index 903ea691ae6ba..8b5b7962da33f 100644 --- a/llvm/test/CodeGen/PowerPC/half.ll +++ b/llvm/test/CodeGen/PowerPC/half.ll @@ -1365,6 +1365,7 @@ define <4 x float> @test_extend32_vec4(ptr %p) nounwind { ; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 +; P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; P8-NEXT: xxmrghd vs0, vs61, vs1 ; P8-NEXT: xxmrghd vs1, vs63, vs62 ; P8-NEXT: ld r30, 96(r1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/ldexp.ll b/llvm/test/CodeGen/PowerPC/ldexp.ll index 8d7253b5ce8e3..23748bca0b7b2 100644 --- a/llvm/test/CodeGen/PowerPC/ldexp.ll +++ b/llvm/test/CodeGen/PowerPC/ldexp.ll @@ -107,6 +107,7 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) nounwind { ; CHECK-NEXT: extsw r4, r3 ; CHECK-NEXT: bl ldexpf ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd vs0, vs1, v29 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: vextuwrx r3, r3, v31 @@ -123,6 +124,7 @@ define <4 x float> @ldexp_v4f32(<4 x float> %val, <4 x i32> %exp) nounwind { ; CHECK-NEXT: xscvspdpn f1, vs0 ; CHECK-NEXT: bl ldexpf ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd vs0, vs1, v29 ; CHECK-NEXT: lxv v31, 80(r1) # 16-byte Folded Reload ; CHECK-NEXT: lxv v30, 64(r1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/llvm.modf.ll b/llvm/test/CodeGen/PowerPC/llvm.modf.ll index 1b137c786cc91..203b3bd15490a 100644 --- a/llvm/test/CodeGen/PowerPC/llvm.modf.ll +++ b/llvm/test/CodeGen/PowerPC/llvm.modf.ll @@ -294,6 +294,7 @@ define { <2 x double>, <2 x double> } @test_modf_v2f64(<2 x double> %a) { ; CHECK-NEXT: addi r4, r1, 40 ; CHECK-NEXT: bl modf ; CHECK-NEXT: nop +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd v2, v30, vs1 ; CHECK-NEXT: lfd f0, 32(r1) ; CHECK-NEXT: lfd f1, 40(r1) diff --git a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll 
b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll index b006c78604648..0364166a1b29e 100644 --- a/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll +++ b/llvm/test/CodeGen/PowerPC/vec_insert_elt.ll @@ -928,21 +928,25 @@ entry: define <2 x double> @testDoubleImm1(<2 x double> %a, double %b) { ; CHECK-LABEL: testDoubleImm1: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxmrghd v2, v2, vs1 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: testDoubleImm1: ; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-BE-NEXT: xxpermdi v2, vs1, v2, 1 ; CHECK-BE-NEXT: blr ; ; CHECK-P9-LABEL: testDoubleImm1: ; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-P9-NEXT: xxpermdi v2, vs1, v2, 1 ; CHECK-P9-NEXT: blr ; ; AIX-P8-LABEL: testDoubleImm1: ; AIX-P8: # %bb.0: # %entry +; AIX-P8-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; AIX-P8-NEXT: xxpermdi v2, vs1, v2, 1 ; AIX-P8-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll index 08ca1d153248e..e6307aa2906da 100644 --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -107,6 +107,10 @@ entry: define <3 x double> @constrained_vector_fdiv_v3f64(<3 x double> %x, <3 x double> %y) #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: xsdivdp 3, 3, 6 @@ -116,6 +120,10 @@ define <3 x double> @constrained_vector_fdiv_v3f64(<3 x double> %x, <3 x double> ; ; PC64LE9-LABEL: constrained_vector_fdiv_v3f64: ; PC64LE9: # %bb.0: # %entry +; 
PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: xsdivdp 3, 3, 6 @@ -209,6 +217,7 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double> ; PC64LE-NEXT: bl fmod ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -239,6 +248,7 @@ define <2 x double> @constrained_vector_frem_v2f64(<2 x double> %x, <2 x double> ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 61, 1 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload @@ -390,6 +400,7 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double> ; PC64LE-NEXT: fmr 2, 30 ; PC64LE-NEXT: bl fmod ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 29 ; PC64LE-NEXT: fmr 2, 31 @@ -431,6 +442,7 @@ define <3 x double> @constrained_vector_frem_v3f64(<3 x double> %x, <3 x double> ; PC64LE9-NEXT: fmr 2, 30 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 29 ; PC64LE9-NEXT: fmr 2, 31 @@ -486,6 +498,7 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double> ; PC64LE-NEXT: xxswapd 2, 62 ; PC64LE-NEXT: bl fmod ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 59, 1 ; PC64LE-NEXT: xxlor 1, 61, 61 ; PC64LE-NEXT: xxlor 2, 63, 63 @@ -498,6 +511,7 @@ define <4 x double> 
@constrained_vector_frem_v4f64(<4 x double> %x, <4 x double> ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 112 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 60, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 96 @@ -536,6 +550,7 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double> ; PC64LE9-NEXT: xxswapd 2, 62 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 59, 1 ; PC64LE9-NEXT: xscpsgndp 1, 61, 61 ; PC64LE9-NEXT: xscpsgndp 2, 63, 63 @@ -546,6 +561,7 @@ define <4 x double> @constrained_vector_frem_v4f64(<4 x double> %x, <4 x double> ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl fmod ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 60, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 96(1) # 16-byte Folded Reload @@ -670,6 +686,10 @@ entry: define <3 x double> @constrained_vector_fmul_v3f64(<3 x double> %x, <3 x double> %y) #0 { ; PC64LE-LABEL: constrained_vector_fmul_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: xsmuldp 3, 3, 6 @@ -679,6 +699,10 @@ define <3 x double> @constrained_vector_fmul_v3f64(<3 x double> %x, <3 x double> ; ; PC64LE9-LABEL: constrained_vector_fmul_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: xsmuldp 3, 3, 6 @@ -820,6 +844,10 @@ entry: define 
<3 x double> @constrained_vector_fadd_v3f64(<3 x double> %x, <3 x double> %y) #0 { ; PC64LE-LABEL: constrained_vector_fadd_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: xsadddp 3, 3, 6 @@ -829,6 +857,10 @@ define <3 x double> @constrained_vector_fadd_v3f64(<3 x double> %x, <3 x double> ; ; PC64LE9-LABEL: constrained_vector_fadd_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: xsadddp 3, 3, 6 @@ -970,6 +1002,10 @@ entry: define <3 x double> @constrained_vector_fsub_v3f64(<3 x double> %x, <3 x double> %y) #0 { ; PC64LE-LABEL: constrained_vector_fsub_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: xssubdp 3, 3, 6 @@ -979,6 +1015,10 @@ define <3 x double> @constrained_vector_fsub_v3f64(<3 x double> %x, <3 x double> ; ; PC64LE9-LABEL: constrained_vector_fsub_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: xssubdp 3, 3, 6 @@ -1105,6 +1145,8 @@ entry: define <3 x double> 
@constrained_vector_sqrt_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xssqrtdp 3, 3 ; PC64LE-NEXT: xvsqrtdp 2, 0 @@ -1113,6 +1155,8 @@ define <3 x double> @constrained_vector_sqrt_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_sqrt_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xssqrtdp 3, 3 ; PC64LE9-NEXT: xvsqrtdp 2, 0 @@ -1203,6 +1247,7 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double> ; PC64LE-NEXT: bl pow ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -1233,6 +1278,7 @@ define <2 x double> @constrained_vector_pow_v2f64(<2 x double> %x, <2 x double> ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 61, 1 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload @@ -1384,6 +1430,7 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double> ; PC64LE-NEXT: fmr 2, 30 ; PC64LE-NEXT: bl pow ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 29 ; PC64LE-NEXT: fmr 2, 31 @@ -1425,6 +1472,7 @@ define <3 x double> @constrained_vector_pow_v3f64(<3 x double> %x, <3 x double> ; PC64LE9-NEXT: fmr 2, 30 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 29 ; 
PC64LE9-NEXT: fmr 2, 31 @@ -1480,6 +1528,7 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double> ; PC64LE-NEXT: xxswapd 2, 62 ; PC64LE-NEXT: bl pow ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 59, 1 ; PC64LE-NEXT: xxlor 1, 61, 61 ; PC64LE-NEXT: xxlor 2, 63, 63 @@ -1492,6 +1541,7 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double> ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 112 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 60, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 96 @@ -1530,6 +1580,7 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double> ; PC64LE9-NEXT: xxswapd 2, 62 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 59, 1 ; PC64LE9-NEXT: xscpsgndp 1, 61, 61 ; PC64LE9-NEXT: xscpsgndp 2, 63, 63 @@ -1540,6 +1591,7 @@ define <4 x double> @constrained_vector_pow_v4f64(<4 x double> %x, <4 x double> ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl pow ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 60, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 96(1) # 16-byte Folded Reload @@ -1618,6 +1670,7 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 { ; PC64LE-NEXT: bl __powidf2 ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: ld 30, 80(1) # 8-byte Folded Reload ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload @@ -1647,6 +1700,7 @@ define <2 x double> @constrained_vector_powi_v2f64(<2 x double> %x, i32 %y) #0 { ; PC64LE9-NEXT: mr 4, 30 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; 
PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -1790,6 +1844,7 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 { ; PC64LE-NEXT: mr 4, 30 ; PC64LE-NEXT: bl __powidf2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: mr 4, 30 @@ -1828,6 +1883,7 @@ define <3 x double> @constrained_vector_powi_v3f64(<3 x double> %x, i32 %y) #0 { ; PC64LE9-NEXT: mr 4, 30 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: mr 4, 30 @@ -1878,6 +1934,7 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 { ; PC64LE-NEXT: mr 4, 30 ; PC64LE-NEXT: bl __powidf2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: mr 4, 30 @@ -1890,6 +1947,7 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: ld 30, 96(1) # 8-byte Folded Reload ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload @@ -1923,6 +1981,7 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 { ; PC64LE9-NEXT: mr 4, 30 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: mr 4, 30 @@ -1933,6 +1992,7 @@ define <4 x double> @constrained_vector_powi_v4f64(<4 x double> %x, i32 %y) #0 { ; PC64LE9-NEXT: mr 4, 30 ; PC64LE9-NEXT: bl __powidf2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 
; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -2003,6 +2063,7 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl sin ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -2027,6 +2088,7 @@ define <2 x double> @constrained_vector_sin_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -2149,6 +2211,7 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl sin ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl sin @@ -2181,6 +2244,7 @@ define <3 x double> @constrained_vector_sin_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl sin @@ -2224,6 +2288,7 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl sin ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl sin @@ -2234,6 +2299,7 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ 
-2262,6 +2328,7 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl sin @@ -2270,6 +2337,7 @@ define <4 x double> @constrained_vector_sin_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl sin ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -2338,6 +2406,7 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl cos ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -2362,6 +2431,7 @@ define <2 x double> @constrained_vector_cos_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -2484,6 +2554,7 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl cos ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl cos @@ -2516,6 +2587,7 @@ define <3 x double> @constrained_vector_cos_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl cos @@ -2559,6 +2631,7 @@ define <4 x double> 
@constrained_vector_cos_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl cos ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl cos @@ -2569,6 +2642,7 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -2597,6 +2671,7 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl cos @@ -2605,6 +2680,7 @@ define <4 x double> @constrained_vector_cos_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl cos ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -2673,6 +2749,7 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl exp ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -2697,6 +2774,7 @@ define <2 x double> @constrained_vector_exp_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -2819,6 +2897,7 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 
x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl exp ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl exp @@ -2851,6 +2930,7 @@ define <3 x double> @constrained_vector_exp_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl exp @@ -2894,6 +2974,7 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl exp ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl exp @@ -2904,6 +2985,7 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -2932,6 +3014,7 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl exp @@ -2940,6 +3023,7 @@ define <4 x double> @constrained_vector_exp_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl exp ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -3008,6 +3092,7 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl exp2 ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed 
$f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -3032,6 +3117,7 @@ define <2 x double> @constrained_vector_exp2_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -3154,6 +3240,7 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl exp2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl exp2 @@ -3186,6 +3273,7 @@ define <3 x double> @constrained_vector_exp2_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl exp2 @@ -3229,6 +3317,7 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl exp2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl exp2 @@ -3239,6 +3328,7 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -3267,6 +3357,7 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: 
xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl exp2 @@ -3275,6 +3366,7 @@ define <4 x double> @constrained_vector_exp2_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl exp2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -3343,6 +3435,7 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl log ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -3367,6 +3460,7 @@ define <2 x double> @constrained_vector_log_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -3489,6 +3583,7 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl log ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl log @@ -3521,6 +3616,7 @@ define <3 x double> @constrained_vector_log_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl log @@ -3564,6 +3660,7 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl log ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl log @@ -3574,6 +3671,7 @@ define <4 x 
double> @constrained_vector_log_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -3602,6 +3700,7 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl log @@ -3610,6 +3709,7 @@ define <4 x double> @constrained_vector_log_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl log ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -3678,6 +3778,7 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl log10 ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -3702,6 +3803,7 @@ define <2 x double> @constrained_vector_log10_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -3824,6 +3926,7 @@ define <3 x double> @constrained_vector_log10_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl log10 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl log10 @@ -3856,6 +3959,7 @@ define <3 x double> 
@constrained_vector_log10_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl log10 @@ -3899,6 +4003,7 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl log10 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl log10 @@ -3909,6 +4014,7 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -3937,6 +4043,7 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl log10 @@ -3945,6 +4052,7 @@ define <4 x double> @constrained_vector_log10_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl log10 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -4013,6 +4121,7 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl log2 ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -4037,6 +4146,7 @@ define <2 x double> @constrained_vector_log2_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: 
xxswapd 1, 63 ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -4159,6 +4269,7 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl log2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl log2 @@ -4191,6 +4302,7 @@ define <3 x double> @constrained_vector_log2_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl log2 @@ -4234,6 +4346,7 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl log2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl log2 @@ -4244,6 +4357,7 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -4272,6 +4386,7 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl log2 @@ -4280,6 +4395,7 @@ define <4 x double> @constrained_vector_log2_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl log2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: 
def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -4387,6 +4503,8 @@ define <3 x float> @constrained_vector_rint_v3f32(<3 x float> %x) #0 { define <3 x double> @constrained_vector_rint_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_rint_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xsrdpic 3, 3 ; PC64LE-NEXT: xvrdpic 2, 0 @@ -4395,6 +4513,8 @@ define <3 x double> @constrained_vector_rint_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_rint_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xsrdpic 3, 3 ; PC64LE9-NEXT: xvrdpic 2, 0 @@ -4479,6 +4599,7 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl nearbyint ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -4503,6 +4624,7 @@ define <2 x double> @constrained_vector_nearbyint_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -4625,6 +4747,7 @@ define <3 x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl nearbyint ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl nearbyint @@ -4657,6 +4780,7 @@ define <3 
x double> @constrained_vector_nearby_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl nearbyint @@ -4700,6 +4824,7 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl nearbyint ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl nearbyint @@ -4710,6 +4835,7 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -4738,6 +4864,7 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl nearbyint @@ -4746,6 +4873,7 @@ define <4 x double> @constrained_vector_nearbyint_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl nearbyint ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -4927,6 +5055,10 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double> ; PC64LE-NEXT: mflr 0 ; PC64LE-NEXT: stdu 1, -64(1) ; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 
def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: std 0, 80(1) @@ -4950,6 +5082,10 @@ define <3 x double> @constrained_vector_max_v3f64(<3 x double> %x, <3 x double> ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: mflr 0 ; PC64LE9-NEXT: stdu 1, -48(1) +; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: std 0, 64(1) @@ -5159,6 +5295,10 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double> ; PC64LE-NEXT: mflr 0 ; PC64LE-NEXT: stdu 1, -64(1) ; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 5, 4 ; PC64LE-NEXT: xxmrghd 1, 2, 1 ; PC64LE-NEXT: std 0, 80(1) @@ -5182,6 +5322,10 @@ define <3 x double> @constrained_vector_min_v3f64(<3 x double> %x, <3 x double> ; PC64LE9: # %bb.0: # %entry ; PC64LE9-NEXT: mflr 0 ; PC64LE9-NEXT: stdu 1, -48(1) +; PC64LE9-NEXT: # kill: def $f5 killed $f5 def $vsl5 +; PC64LE9-NEXT: # kill: def $f4 killed $f4 def $vsl4 +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 5, 4 ; PC64LE9-NEXT: xxmrghd 1, 2, 1 ; PC64LE9-NEXT: std 0, 64(1) @@ -6520,6 +6664,8 @@ entry: define <3 x double> @constrained_vector_ceil_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_ceil_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xsrdpip 3, 3 ; PC64LE-NEXT: xvrdpip 2, 0 @@ -6528,6 +6674,8 @@ define <3 x double> 
@constrained_vector_ceil_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_ceil_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xsrdpip 3, 3 ; PC64LE9-NEXT: xvrdpip 2, 0 @@ -6628,6 +6776,8 @@ entry: define <3 x double> @constrained_vector_floor_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_floor_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xsrdpim 3, 3 ; PC64LE-NEXT: xvrdpim 2, 0 @@ -6636,6 +6786,8 @@ define <3 x double> @constrained_vector_floor_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_floor_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xsrdpim 3, 3 ; PC64LE9-NEXT: xvrdpim 2, 0 @@ -6736,6 +6888,8 @@ entry: define <3 x double> @constrained_vector_round_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_round_v3f64: ; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xsrdpi 3, 3 ; PC64LE-NEXT: xvrdpi 2, 0 @@ -6744,6 +6898,8 @@ define <3 x double> @constrained_vector_round_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_round_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xsrdpi 3, 3 ; PC64LE9-NEXT: xvrdpi 2, 0 @@ -6843,6 +6999,8 @@ entry: define <3 x double> @constrained_vector_trunc_v3f64(<3 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_trunc_v3f64: ; PC64LE: # %bb.0: # %entry +; 
PC64LE-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 0, 2, 1 ; PC64LE-NEXT: xsrdpiz 3, 3 ; PC64LE-NEXT: xvrdpiz 2, 0 @@ -6851,6 +7009,8 @@ define <3 x double> @constrained_vector_trunc_v3f64(<3 x double> %x) #0 { ; ; PC64LE9-LABEL: constrained_vector_trunc_v3f64: ; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: # kill: def $f2 killed $f2 def $vsl2 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 0, 2, 1 ; PC64LE9-NEXT: xsrdpiz 3, 3 ; PC64LE9-NEXT: xvrdpiz 2, 0 @@ -8049,6 +8209,7 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 { ; PC64LE-NEXT: bl tan ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 62, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 48 @@ -8073,6 +8234,7 @@ define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl tan ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 62, 1 ; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload @@ -8195,6 +8357,7 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 { ; PC64LE-NEXT: fmr 1, 30 ; PC64LE-NEXT: bl tan ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 31 ; PC64LE-NEXT: bl tan @@ -8227,6 +8390,7 @@ define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 { ; PC64LE9-NEXT: fmr 1, 30 ; PC64LE9-NEXT: bl tan ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 31 ; PC64LE9-NEXT: bl tan @@ -8270,6 +8434,7 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: xxswapd 1, 62 ; PC64LE-NEXT: bl tan ; 
PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 61, 1 ; PC64LE-NEXT: xxlor 1, 63, 63 ; PC64LE-NEXT: bl tan @@ -8280,6 +8445,7 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 { ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -8308,6 +8474,7 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 62 ; PC64LE9-NEXT: bl tan ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 61, 1 ; PC64LE9-NEXT: xscpsgndp 1, 63, 63 ; PC64LE9-NEXT: bl tan @@ -8316,6 +8483,7 @@ define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 { ; PC64LE9-NEXT: xxswapd 1, 63 ; PC64LE9-NEXT: bl tan ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 61, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload @@ -8390,6 +8558,7 @@ define <2 x double> @constrained_vector_atan2_v2f64(<2 x double> %x, <2 x double ; PC64LE-NEXT: bl atan2 ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 80 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 34, 61, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 64 @@ -8420,6 +8589,7 @@ define <2 x double> @constrained_vector_atan2_v2f64(<2 x double> %x, <2 x double ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl atan2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 34, 61, 1 ; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload ; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload @@ -8571,6 +8741,7 @@ define <3 x double> @constrained_vector_atan2_v3f64(<3 x double> %x, <3 x double ; PC64LE-NEXT: fmr 2, 30 ; PC64LE-NEXT: bl atan2 ; 
PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 63, 1, 63 ; PC64LE-NEXT: fmr 1, 29 ; PC64LE-NEXT: fmr 2, 31 @@ -8612,6 +8783,7 @@ define <3 x double> @constrained_vector_atan2_v3f64(<3 x double> %x, <3 x double ; PC64LE9-NEXT: fmr 2, 30 ; PC64LE9-NEXT: bl atan2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 63, 1, 63 ; PC64LE9-NEXT: fmr 1, 29 ; PC64LE9-NEXT: fmr 2, 31 @@ -8667,6 +8839,7 @@ define <4 x double> @constrained_vector_atan2_v4f64(<4 x double> %x, <4 x double ; PC64LE-NEXT: xxswapd 2, 62 ; PC64LE-NEXT: bl atan2 ; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 62, 59, 1 ; PC64LE-NEXT: xxlor 1, 61, 61 ; PC64LE-NEXT: xxlor 2, 63, 63 @@ -8679,6 +8852,7 @@ define <4 x double> @constrained_vector_atan2_v4f64(<4 x double> %x, <4 x double ; PC64LE-NEXT: nop ; PC64LE-NEXT: li 3, 112 ; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE-NEXT: xxmrghd 35, 60, 1 ; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload ; PC64LE-NEXT: li 3, 96 @@ -8717,6 +8891,7 @@ define <4 x double> @constrained_vector_atan2_v4f64(<4 x double> %x, <4 x double ; PC64LE9-NEXT: xxswapd 2, 62 ; PC64LE9-NEXT: bl atan2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 62, 59, 1 ; PC64LE9-NEXT: xscpsgndp 1, 61, 61 ; PC64LE9-NEXT: xscpsgndp 2, 63, 63 @@ -8727,6 +8902,7 @@ define <4 x double> @constrained_vector_atan2_v4f64(<4 x double> %x, <4 x double ; PC64LE9-NEXT: xxswapd 2, 63 ; PC64LE9-NEXT: bl atan2 ; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; PC64LE9-NEXT: xxmrghd 35, 60, 1 ; PC64LE9-NEXT: vmr 2, 30 ; PC64LE9-NEXT: lxv 63, 96(1) # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll new file mode 100644 index 
0000000000000..ea7454faad218 --- /dev/null +++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s + +%struct.wibble = type { %struct.wombat } +%struct.wombat = type { %struct.ham, [3 x i8] } +%struct.ham = type { %struct.zot } +%struct.zot = type { %struct.blam } +%struct.blam = type { %struct.ham.0 } +%struct.ham.0 = type { %struct.bar } +%struct.bar = type { %struct.bar.1 } +%struct.bar.1 = type { %struct.baz, i8 } +%struct.baz = type { %struct.snork } +%struct.snork = type <{ %struct.spam, i8, [3 x i8] }> +%struct.spam = type { %struct.snork.2, %struct.snork.2 } +%struct.snork.2 = type { i32 } +%struct.snork.3 = type { %struct.baz, i8, [3 x i8] } + +define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl %r8d, %r14d +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %r13 +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: # implicit-def: $r12 +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %bb17 +; CHECK-NEXT: # in Loop: 
Header=BB0_3 Depth=1 +; CHECK-NEXT: movq %r15, %r13 +; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: testq %rbx, %rbx +; CHECK-NEXT: sete %r15b +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: callq _Znwm@PLT +; CHECK-NEXT: shll $4, %r15d +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; CHECK-NEXT: movq %r12, %rcx +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movb %cl, 12(%rax) +; CHECK-NEXT: movl %r12d, 8(%rax) +; CHECK-NEXT: movq %r15, %rbx +; CHECK-NEXT: movq %r13, %r15 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; CHECK-NEXT: decl %r14d +; CHECK-NEXT: je .LBB0_8 +; CHECK-NEXT: .LBB0_3: # %bb7 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: callq widget@PLT +; CHECK-NEXT: cmpb $-5, (%r13) +; CHECK-NEXT: jae .LBB0_5 +; CHECK-NEXT: # %bb.4: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: movl %r12d, %r12d +; CHECK-NEXT: cmpq %r15, %rbx +; CHECK-NEXT: jbe .LBB0_1 +; CHECK-NEXT: jmp .LBB0_7 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_5: # %bb12 +; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: movq 0, %rax +; CHECK-NEXT: movq 8, %rax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; CHECK-NEXT: cmpq %r15, %rbx +; CHECK-NEXT: jbe .LBB0_1 +; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: decl %r14d +; CHECK-NEXT: jne .LBB0_3 +; CHECK-NEXT: .LBB0_8: # %bb21 +; CHECK-NEXT: cmpb $0, 12(%rax) +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: # %bb.9: # %bb26 +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_10: # %bb25 +; CHECK-NEXT: .cfi_def_cfa %rbp, 16 +; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: callq pluto@PLT +bb: + br label %bb7 + +bb5: ; preds = %bb17, %bb14 + %phi = phi ptr [ %call19, 
%bb17 ], [ null, %bb14 ] + %phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ] + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, %arg4 + br i1 %icmp, label %bb21, label %bb7 + +bb7: ; preds = %bb5, %bb + %phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ] + %phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ] + %phi10 = phi i40 [ poison, %bb ], [ %phi15, %bb5 ] + %call = call ptr @widget() + %load = load i8, ptr %arg1, align 8 + %icmp11 = icmp ult i8 %load, -5 + %and = and i40 %phi10, 4294967295 + br i1 %icmp11, label %bb14, label %bb12 + +bb12: ; preds = %bb7 + %load13 = load volatile { i64, i64 }, ptr null, align 4294967296 + br label %bb14 + +bb14: ; preds = %bb12, %bb7 + %phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ] + %icmp16 = icmp ugt ptr %phi8, %arg + br i1 %icmp16, label %bb5, label %bb17 + +bb17: ; preds = %bb14 + %icmp18 = icmp eq ptr %phi8, null + %zext = zext i1 %icmp18 to i64 + %call19 = call ptr @_Znwm(i64 0) + %getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext + %getelementptr20 = getelementptr i8, ptr %call19, i64 8 + store i40 %phi15, ptr %getelementptr20, align 4 + br label %bb5 + +bb21: ; preds = %bb5 + %getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1 + %load23 = load i8, ptr %getelementptr22, align 4 + %icmp24 = icmp eq i8 %load23, 0 + br i1 %icmp24, label %bb26, label %bb25 + +bb25: ; preds = %bb21 + call void @pluto(ptr %arg) + unreachable + +bb26: ; preds = %bb21 + ret void +} + +define void @eggs(ptr %arg, ptr %arg1) { +; CHECK-LABEL: eggs: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +bb: + call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0) + ret void +} + 
+declare ptr @widget() + +declare void @pluto(ptr) + +declare ptr @_Znwm(i64) + +attributes #0 = { noinline "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir index 8241a1757af52..0bc208dc709d7 100644 --- a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir +++ b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=register-coalescer -o - %s | FileCheck %s +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=register-coalescer -o - %s | FileCheck %s --match-full-lines --- name: rematerialize_subreg_to_reg_added_impdef_1 tracksRegLiveness: true @@ -9,7 +9,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x2aaaaaab), %bb.2(0x55555555) ; CHECK-NEXT: liveins: $edi ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] ; CHECK-NEXT: JCC_1 %bb.2, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -28,7 +28,7 @@ body: | ; CHECK-NEXT: JCC_1 %bb.5, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al + ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al, implicit-def $al ; CHECK-NEXT: RET 0, killed undef $al ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: diff --git a/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir new file mode 100644 index 
0000000000000..2e6395f065e25 --- /dev/null +++ b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir @@ -0,0 +1,44 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s + +--- +name: requires_new_subrange_coalesce_subreg_to_reg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax + ; CHECK-NEXT: %b:gr32 = IMPLICIT_DEF + ; CHECK-NEXT: %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit + ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit undef $eflags + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: %c.sub_32bit:gr64 = COPY %a + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit + ; CHECK-NEXT: RET 0, implicit %c + bb.0: + liveins: $eax + %init_eax:gr32 = COPY $eax + %a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit + %b:gr32 = IMPLICIT_DEF + %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit + JCC_1 %bb.2, 4, implicit undef $eflags + + bb.1: + %imm0:gr32 = MOV32r0 implicit-def dead $eflags + %a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit + %c.sub_32bit = COPY %a + + bb.2: + %c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit + RET 0, implicit %c + +... 
diff --git a/llvm/test/CodeGen/X86/pr76416.ll b/llvm/test/CodeGen/X86/pr76416.ll new file mode 100644 index 0000000000000..68e9ef9c87f6e --- /dev/null +++ b/llvm/test/CodeGen/X86/pr76416.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; +; Reproducer from https://github.com/llvm/llvm-project/issues/76416 +; + +@load_p = external global ptr, align 8 +@load_data = external global i8, align 1 + +define dso_local void @pr76416() { +; CHECK-LABEL: pr76416: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: jg .LBB0_3 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: incl -{{[0-9]+}}(%rsp) +; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: jle .LBB0_2 +; CHECK-NEXT: .LBB0_3: # %for.end +; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq load_p@GOTPCREL(%rip), %rax +; CHECK-NEXT: movq load_data@GOTPCREL(%rip), %rcx +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_4: # %for.cond1 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq (%rax), %rdx +; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: movzbl (%rdx,%rsi), %edx +; CHECK-NEXT: movb %dl, (%rcx) +; CHECK-NEXT: leal 1(%rsi), %edx +; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: jmp .LBB0_4 +entry: + %alloca = alloca i32, align 4 + store i32 0, ptr %alloca, align 4 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %load.from.alloca.0 = load i32, ptr %alloca, align 4 + %cmp = icmp slt i32 %load.from.alloca.0, 4 + br i1 %cmp, label 
%for.body, label %for.end + +for.body: ; preds = %for.cond + call void asm sideeffect "", "{ax},~{dirflag},~{fpsr},~{flags}"(i8 0) nounwind + %load.from.alloca.1 = load i32, ptr %alloca, align 4 + %inc = add nsw i32 %load.from.alloca.1, 1 + store i32 %inc, ptr %alloca, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, ptr %alloca, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.cond1, %for.end + call void asm sideeffect "", "N{dx},~{dirflag},~{fpsr},~{flags}"(i32 poison) nounwind + %load.from.load_p = load ptr, ptr @load_p, align 8 + %regs = getelementptr inbounds { [4 x i8] }, ptr %load.from.load_p, i32 0, i32 0 + %load.from.alloca.2 = load i32, ptr %alloca, align 4 + %idxprom = sext i32 %load.from.alloca.2 to i64 + %arrayidx = getelementptr inbounds [4 x i8], ptr %regs, i64 0, i64 %idxprom + %load.with.gep.ptr = load i8, ptr %arrayidx, align 1 + store i8 %load.with.gep.ptr, ptr @load_data, align 1 + %load.from.alloca.3 = load i32, ptr %alloca, align 4 + %inc2 = add nsw i32 %load.from.alloca.3, 1 + store i32 %inc2, ptr %alloca, align 4 + br label %for.cond1 +} diff --git a/llvm/test/CodeGen/X86/subreg-fail.mir b/llvm/test/CodeGen/X86/subreg-fail.mir index c8146f099b814..dc690719e8581 100644 --- a/llvm/test/CodeGen/X86/subreg-fail.mir +++ b/llvm/test/CodeGen/X86/subreg-fail.mir @@ -14,8 +14,8 @@ tracksRegLiveness: true body: | bb.0: ; CHECK-LABEL: name: test1 - ; CHECK: undef [[MOV32rm:%[0-9]+]].sub_32bit:gr64_nosp = MOV32rm undef %1:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`) - ; CHECK-NEXT: undef [[MOV32rm1:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32rm undef %4:gr64, 1, $noreg, 0, $noreg :: (volatile load (s32) from `ptr undef`) + ; CHECK: undef [[MOV32rm:%[0-9]+]].sub_32bit:gr64_nosp = MOV32rm undef %1:gr64, 1, $noreg, 0, $noreg, implicit-def [[MOV32rm]] :: (volatile load (s32) from `ptr undef`) + ; CHECK-NEXT: undef [[MOV32rm1:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32rm undef %4:gr64, 1, 
$noreg, 0, $noreg, implicit-def [[MOV32rm1]] :: (volatile load (s32) from `ptr undef`) ; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[MOV32rm1]], 32, implicit-def dead $eflags ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64_with_sub_8bit = LEA64r [[MOV32rm1]], 1, [[MOV32rm]], 256, $noreg ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64_with_sub_8bit = SHR64ri [[LEA64r]], 8, implicit-def dead $eflags diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir new file mode 100644 index 0000000000000..ff946b76e8f61 --- /dev/null +++ b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir @@ -0,0 +1,451 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -o - %s | FileCheck %s --match-full-lines + +# We cannot lose the liveness of the high subregister of %1 when +# coalesced with %0, so introduce an implicit-def of the super +# register on the MOV. 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64 +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: subreg_to_reg_folds_to_undef +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $rax + + ; CHECK-LABEL: name: subreg_to_reg_folds_to_undef + ; CHECK: liveins: $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY $rax + ; CHECK-NEXT: undef [[MOV32rr:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32rr [[COPY]].sub_32bit, implicit-def [[MOV32rr]] + ; CHECK-NEXT: RET 0, implicit [[MOV32rr]] + %0:gr64 = COPY killed $rax + %1:gr32 = COPY killed %0.sub_32bit + %2:gr32 = MOV32rr killed %1 + %3:gr64 = SUBREG_TO_REG 0, killed %2, %subreg.sub_32bit + %4:gr64 = COPY killed %3 + RET 0, implicit %4 + +... + +--- +name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def 
dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %0 + %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def undef [[MOV32r0_]].sub_8bit, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit [[MOV32r0_]] + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags, implicit-def undef %0.sub_8bit + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + INLINEASM &"", 0, implicit %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ + +# Reduced realistic case which was asserting after introducing new implicit-defs +--- +name: coalesce_needs_implicit_defs +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_needs_implicit_defs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $rdi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: undef [[MOV32r0_1:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[MOV32r0_2:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: TEST64rr [[MOV32r0_1]], [[MOV32r0_1]], implicit-def $eflags + ; CHECK-NEXT: [[MOV32r0_2:%[0-9]+]].sub_8bit:gr64_with_sub_8bit = SETCCr 4, implicit killed $eflags + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: [[MOV32r0_2:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[MOV32r0_2]], 4, implicit-def dead $eflags + ; CHECK-NEXT: [[MOV32r0_2:%[0-9]+]]:gr64_with_sub_8bit = ADD64rr [[MOV32r0_2]], [[COPY]], implicit-def dead $eflags + ; CHECK-NEXT: [[MOV32r0_1:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32r0_2]] + ; CHECK-NEXT: JMP_1 %bb.1 + bb.0: + liveins: $rdi + + %0:gr64 = COPY killed $rdi 
+ %1:gr32 = MOV32r0 implicit-def dead $eflags + %2:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit + %3:gr64 = COPY killed %2 + + bb.1: + %4:gr64 = COPY killed %3 + %5:gr32 = MOV32r0 implicit-def dead $eflags + TEST64rr killed %4, %4, implicit-def $eflags + %6:gr8 = SETCCr 4, implicit killed $eflags + %7:gr32 = COPY killed %5 + %7.sub_8bit:gr32 = COPY killed %6 + %8:gr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32bit + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %9:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit + $rdi = COPY %9 + CALL64r killed %9, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %10:gr64 = COPY killed %8 + %10:gr64 = SHL64ri %10, 4, implicit-def dead $eflags + %11:gr64 = COPY killed %10 + %11:gr64 = ADD64rr %11, %0, implicit-def dead $eflags + %3:gr64 = COPY killed %11 + JMP_1 %bb.1 + +... + +# Make sure to add the 'undef' flag to the result register %2, +# because the top 32bits are not defined. 
+--- +name: coalesce_add_implicitdef_and_undef +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_add_implicitdef_and_undef + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $eflags, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = COPY $edx + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = ADD32ri [[COPY]].sub_32bit, -34, implicit-def $eflags, implicit-def [[COPY]] + ; CHECK-NEXT: FAKE_USE [[COPY]] + ; CHECK-NEXT: RET 0 + bb.0: + liveins: $eflags, $edx + %0:gr32 = COPY $edx + JMP_1 %bb.1 + + bb.1: + %1:gr32 = COPY %0 + %1:gr32 = ADD32ri %1, -34, implicit-def $eflags + %2:gr64_with_sub_8bit = SUBREG_TO_REG 0, killed %1, %subreg.sub_32bit + FAKE_USE %2 + RET 0 +... + +# We can't mark the destination register as 'undef' or add implicit-def +# because the top 24 bits of %0:gr32 are retained by the SUBREG_TO_REG. +# +# For example, if this were to result in: +# +# undef %2.sub_32bit:gr64_with_sub_8bit = COPY $edx +# %1:gr8 = SETCCr 4, implicit $eflags +# JMP_1 %bb.1 +# +# bb.1: +# undef %2.sub_8bit:gr64_with_sub_8bit = COPY %1, implicit-def %2 +# +# Then this says that the top 56 bits of %2 are undef. That's not correct +# because only the top 32 bits are undef. 
+--- +name: coalesce_dont_add_implicitdef_or_undef +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_dont_add_implicitdef_or_undef + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $eflags, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = COPY $edx + ; CHECK-NEXT: [[SETCCr:%[0-9]+]]:gr8 = SETCCr 4, implicit $eflags + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub_8bit:gr64_with_sub_8bit = COPY [[SETCCr]] + ; CHECK-NEXT: FAKE_USE [[COPY]] + ; CHECK-NEXT: RET 0 + bb.0: + liveins: $eflags, $edx + %0:gr32 = COPY $edx + %1:gr8 = SETCCr 4, implicit killed $eflags + JMP_1 %bb.1 + + bb.1: + %0.sub_8bit:gr32 = COPY %1 + %2:gr64_with_sub_8bit = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + FAKE_USE %2 + RET 0 +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + $rdi = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, 
implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $eax + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $eax = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, $eax, %subreg.sub_32bit + ; CHECK-NEXT: $rdi = COPY [[SUBREG_TO_REG]] + ; CHECK-NEXT: CALL64r [[SUBREG_TO_REG]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + $eax = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed $eax, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +# Coalesced instruction is a copy with other implicit operands +--- +name: coalesce_copy_into_subreg_to_reg64 +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $eax + ; CHECK-LABEL: name: coalesce_copy_into_subreg_to_reg64 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = COPY $eax, implicit-def dead $eflags, implicit-def [[COPY]] + ; CHECK-NEXT: $rdi = COPY [[COPY]] + ; CHECK-NEXT: CALL64r [[COPY]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = COPY $eax, implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef [[MOV32r0_]].sub_32bit, implicit [[MOV32r0_]].sub_32bit, implicit-def [[MOV32r0_]] + ; CHECK-NEXT: $rdi = COPY [[MOV32r0_]] + ; CHECK-NEXT: CALL64r [[MOV32r0_]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + INLINEASM &"", 0, implicit-def %0, implicit %0 + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 + ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + bb.0: + INLINEASM &"", 0, implicit-def %0:gr32 + JCC_1 %bb.1, 4, implicit undef $eflags + JMP_1 %bb.2 + + bb.1: + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + + bb.2: + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def +frameInfo: + adjustsStack: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 + ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + bb.0: + + INLINEASM &"", 0, implicit-def %0:gr32 + JCC_1 %bb.1, 4, implicit undef $eflags + JMP_1 %bb.2 + + bb.1: + %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32bit + $rdi = COPY %1 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + JMP_1 %bb.1 + + bb.2: + +... 
diff --git a/llvm/test/Transforms/Coroutines/declare-value.ll b/llvm/test/Transforms/Coroutines/declare-value.ll new file mode 100644 index 0000000000000..94049c28169b9 --- /dev/null +++ b/llvm/test/Transforms/Coroutines/declare-value.ll @@ -0,0 +1,68 @@ +;RUN: opt -mtriple='arm64-' %s -S -passes='module(coro-early),cgscc(coro-split,simplifycfg)' -o - | FileCheck %s + +; CHECK: %.debug = alloca double, align 8 +; CHECK-NEXT: #dbg_declare(ptr %{{.*}}, !{{[0-9]+}}, !DIExpression(DW_OP_deref), !{{[0-9]+}}) +; CHECK-NEXT: store double %{{[0-9]+}}, ptr %{{.*}}, align 8 +; CHECK-NEXT: #dbg_declare(ptr %arg, !{{[0-9]+}}, !DIExpression(DW_OP_plus_uconst, 24), !{{[0-9]+}}) + +; ModuleID = '/Users/srastogi/Development/llvm-project-2/llvm/test/Transforms/Coroutines/declare-value.ll' +source_filename = "/Users/srastogi/Development/llvm-project-2/llvm/test/Transforms/Coroutines/declare-value.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-unknown" + +@coroutineATu = global <{ i32, i32 }> <{ i32 trunc (i64 sub (i64 ptrtoint (ptr @coroutineA to i64), i64 ptrtoint (ptr @coroutineATu to i64)) to i32), i32 16 }>, align 8 + +; Function Attrs: presplitcoroutine +define swifttailcc void @coroutineA(ptr swiftasync %arg, double %0) #0 !dbg !1 { + %var_with_dbg_value = alloca ptr, align 8 + %var_with_dbg_declare = alloca ptr, align 8 + #dbg_declare(ptr %var_with_dbg_declare, !5, !DIExpression(), !7) + #dbg_declare_value(double %0, !5, !DIExpression(), !7) + %i2 = call token @llvm.coro.id.async(i32 16, i32 16, i32 0, ptr nonnull @coroutineATu) + %i3 = call ptr @llvm.coro.begin(token %i2, ptr null) + %i7 = call ptr @llvm.coro.async.resume(), !dbg !7 + %i10 = call { ptr } (i32, ptr, ptr, ...) 
@llvm.coro.suspend.async.sl_p0s(i32 0, ptr %i7, ptr nonnull @__swift_async_resume_get_context, ptr nonnull @coroutineA.1, ptr %i7, i64 0, i64 0, ptr %arg), !dbg !7 + call void @dont_optimize(ptr %var_with_dbg_value, ptr %var_with_dbg_declare), !dbg !7 + unreachable, !dbg !7 +} + +define weak_odr hidden ptr @__swift_async_resume_get_context(ptr %arg) !dbg !8 { + ret ptr %arg, !dbg !9 +} + +define hidden swifttailcc void @coroutineA.1(ptr %arg, i64 %arg1, i64 %arg2, ptr %arg3) !dbg !10 { + ret void, !dbg !11 +} + +declare void @dont_optimize(ptr, ptr) + +; Function Attrs: nomerge nounwind +declare ptr @llvm.coro.async.resume() #1 + +; Function Attrs: nounwind +declare ptr @llvm.coro.begin(token, ptr writeonly) #2 + +; Function Attrs: nounwind +declare token @llvm.coro.id.async(i32, i32, i32, ptr) #2 + +; Function Attrs: nomerge nounwind +declare { ptr } @llvm.coro.suspend.async.sl_p0s(i32, ptr, ptr, ...) #1 + +attributes #0 = { presplitcoroutine } +attributes #1 = { nomerge nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !4) +!2 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !3, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "blah", directory: "") +!4 = !{} +!5 = !DILocalVariable(scope: !1, type: !6) +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Klass") +!7 = !DILocation(line: 0, scope: !1) +!8 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !2) +!9 = !DILocation(line: 0, scope: !8) +!10 = distinct !DISubprogram(scope: null, spFlags: DISPFlagDefinition, unit: !2) +!11 = !DILocation(line: 0, scope: !10) \ No newline at end of file diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll index 212a5c99676f4..877484f5159fd 
100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll @@ -63,7 +63,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) { ; CHECK-NEXT: store i32 [[STORE]], ptr [[NBRBOXES]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -114,7 +114,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RVA23: middle.block: ; RVA23-NEXT: br label [[LOOP:%.*]] ; RVA23: exit: @@ -141,7 +141,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RVA23ZVL1024B: middle.block: ; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]] ; RVA23ZVL1024B: exit: @@ -185,16 +185,16 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], [[VEC_IND]] ; 
CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[TMP5]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( zeroinitializer, align 1 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar-widen-gep-scalable.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar-widen-gep-scalable.ll new file mode 100644 index 0000000000000..6746e92cc1fd1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar-widen-gep-scalable.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph" --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=2 \ +; RUN: -force-target-supports-scalable-vectors=true \ +; RUN: -scalable-vectorization=preferred -S %s | FileCheck %s + +define void @widengep_narrow(ptr %in, ptr noalias %p) { +; CHECK-LABEL: define void @widengep_narrow( +; CHECK-SAME: ptr [[IN:%.*]], ptr noalias [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[IN]], i64 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, [[BROADCAST_SPLAT2]], 
[[VEC_IND]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP7]], i32 [[TMP10]] +; CHECK-NEXT: store ptr [[TMP11]], ptr [[P]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.in.off = getelementptr i8, ptr %in, i64 8 + %gep.in.iv = getelementptr i32, ptr %gep.in.off, i64 %iv + store ptr %gep.in.iv, ptr %p + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1024 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll new file mode 100644 index 0000000000000..00eeb69dcb0f7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll @@ -0,0 +1,201 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses -S | FileCheck %s + +define void @pr128062(ptr %dst.start, i8 %a, i16 %b) { +; CHECK-LABEL: define void @pr128062( +; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; 
CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255) +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[STRIDED_VEC3]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255) +; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x 
i8> [[BROADCAST_SPLAT2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255) +; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255) +; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, %loop ] + %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4 + %load.dst = load i8, ptr %dst, align 1 + %dst.ext = zext i8 %load.dst to i16 + %mul.dst.0 = mul nuw i16 %dst.ext, %b + %udiv.0 = udiv i16 %mul.dst.0, 255 + %trunc.0 = trunc nuw i16 %udiv.0 
to i8 + %val.0 = add i8 %a, %trunc.0 + store i8 %val.0, ptr %dst, align 1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1 + %load.dst.1 = load i8, ptr %gep.dst.1, align 1 + %dst.1.ext = zext i8 %load.dst.1 to i16 + %mul.dst.1 = mul nuw i16 %dst.1.ext, %b + %udiv.1 = udiv i16 %mul.dst.1, 255 + %trunc.1 = trunc nuw i16 %udiv.1 to i8 + %val.1 = add i8 %a, %trunc.1 + store i8 %val.1, ptr %gep.dst.1, align 1 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2 + %load.dst.2 = load i8, ptr %gep.dst.2, align 1 + %dst.2.ext = zext i8 %load.dst.2 to i16 + %mul.dst.2 = mul nuw i16 %dst.2.ext, %b + %udiv.2 = udiv i16 %mul.dst.2, 255 + %trunc.2 = trunc nuw i16 %udiv.2 to i8 + %val.2 = add i8 %a, %trunc.2 + store i8 %val.2, ptr %gep.dst.2, align 1 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3 + %load.dst.3 = load i8, ptr %gep.dst.3, align 1 + %dst.3.ext = zext i8 %load.dst.3 to i16 + %mul.dst.3 = mul nuw i16 %dst.3.ext, %b + %udiv.3 = udiv i16 %mul.dst.3, 255 + %trunc.3 = trunc nuw i16 %udiv.3 to i8 + %val.3 = add i8 %a, %trunc.3 + store i8 %val.3, ptr %gep.dst.3, align 1 + %iv.next = add i64 %iv, 4 + %exit.cond = icmp eq i64 %iv.next, 256 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +; Same as above, except one zext is replaced with an sext. 
+define void @opcode_mismatch(ptr %dst.start, i8 %a, i16 %b) { +; CHECK-LABEL: define void @opcode_mismatch( +; CHECK-SAME: ptr [[DST_START:%.*]], i8 [[A:%.*]], i16 [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255) +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i8> [[STRIDED_VEC3]] to <4 x i16> +; 
CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255) +; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255) +; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255) +; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %dst = phi ptr [ %dst.start, %entry ], [ %dst.next, 
%loop ] + %dst.next = getelementptr inbounds nuw i8, ptr %dst, i64 4 + %load.dst = load i8, ptr %dst, align 1 + %dst.ext = zext i8 %load.dst to i16 + %mul.dst.0 = mul nuw i16 %dst.ext, %b + %udiv.0 = udiv i16 %mul.dst.0, 255 + %trunc.0 = trunc nuw i16 %udiv.0 to i8 + %val.0 = add i8 %a, %trunc.0 + store i8 %val.0, ptr %dst, align 1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %dst, i64 1 + %load.dst.1 = load i8, ptr %gep.dst.1, align 1 + %dst.1.ext = sext i8 %load.dst.1 to i16 + %mul.dst.1 = mul nuw i16 %dst.1.ext, %b + %udiv.1 = udiv i16 %mul.dst.1, 255 + %trunc.1 = trunc nuw i16 %udiv.1 to i8 + %val.1 = add i8 %a, %trunc.1 + store i8 %val.1, ptr %gep.dst.1, align 1 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %dst, i64 2 + %load.dst.2 = load i8, ptr %gep.dst.2, align 1 + %dst.2.ext = zext i8 %load.dst.2 to i16 + %mul.dst.2 = mul nuw i16 %dst.2.ext, %b + %udiv.2 = udiv i16 %mul.dst.2, 255 + %trunc.2 = trunc nuw i16 %udiv.2 to i8 + %val.2 = add i8 %a, %trunc.2 + store i8 %val.2, ptr %gep.dst.2, align 1 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %dst, i64 3 + %load.dst.3 = load i8, ptr %gep.dst.3, align 1 + %dst.3.ext = zext i8 %load.dst.3 to i16 + %mul.dst.3 = mul nuw i16 %dst.3.ext, %b + %udiv.3 = udiv i16 %mul.dst.3, 255 + %trunc.3 = trunc nuw i16 %udiv.3 to i8 + %val.3 = add i8 %a, %trunc.3 + store i8 %val.3, ptr %gep.dst.3, align 1 + %iv.next = add i64 %iv, 4 + %exit.cond = icmp eq i64 %iv.next, 256 + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index 9bb010c0431d8..90ef97609e096 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -8,14 +8,14 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: 
[[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]] -; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8 +; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -55,11 +55,11 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[N]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]] ; CHECK-NEXT: [[TMP2:%.*]] = 
getelementptr ptr, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8 +; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/tools/dsymutil/AArch64/dummy-debug-map-arm64.map b/llvm/test/tools/dsymutil/AArch64/dummy-debug-map-arm64.map index 50d860290422c..bd2b2014ee22c 100644 --- a/llvm/test/tools/dsymutil/AArch64/dummy-debug-map-arm64.map +++ b/llvm/test/tools/dsymutil/AArch64/dummy-debug-map-arm64.map @@ -11,9 +11,13 @@ objects: - filename: 1.o symbols: - { sym: _bar, objAddr: 0x0, binAddr: 0x10000, size: 0x10 } + - { sym: __Z13lib1_internalv, objAddr: 0x0, binAddr: 0x10020, size: 0x20 } + - { sym: __ZN3Foo4funcIZ13lib1_internalvE3$_0EEvv, objAddr: 0x0, binAddr: 0x10040, size: 0x20 } - filename: 2.o symbols: - { sym: __Z3foov, objAddr: 0x0, binAddr: 0x20000, size: 0x10 } + - { sym: __Z13lib1_internalv, objAddr: 0x0, binAddr: 0x20020, size: 0x20 } + - { sym: __ZN3Foo4funcIZ13lib1_internalvE3$_0EEvv, objAddr: 0x0, binAddr: 0x20040, size: 0x20 } - filename: 3.o symbols: - { sym: __Z3foov, objAddr: 0x0, binAddr: 0x30000, size: 0x10 } diff --git a/llvm/test/tools/dsymutil/AArch64/dwarf5-str-offsets-base-strx.test b/llvm/test/tools/dsymutil/AArch64/dwarf5-str-offsets-base-strx.test index c0c4fe835682f..c5110a873c603 100644 --- a/llvm/test/tools/dsymutil/AArch64/dwarf5-str-offsets-base-strx.test +++ b/llvm/test/tools/dsymutil/AArch64/dwarf5-str-offsets-base-strx.test @@ -98,7 +98,7 @@ CHECK: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x000000 CHECK: DW_AT_comp_dir [DW_FORM_strx] (indexed (00000004) string = "/Users/shubham/Development/test109275485") CHECK: DW_TAG_subprogram -CHECK: DW_AT_low_pc [DW_FORM_addrx] (indexed (00000000) address = 0x0000000000010000) 
+CHECK: DW_AT_low_pc [DW_FORM_addrx] (indexed (00000000) address = 0x0000000000010040) CHECK: DW_AT_linkage_name [DW_FORM_strx] (indexed (00000005) string = "_Z4foo2i") CHECK: DW_AT_name [DW_FORM_strx] (indexed (00000006) string = "foo2") diff --git a/llvm/test/tools/dsymutil/AArch64/inlined-low_pc.c b/llvm/test/tools/dsymutil/AArch64/inlined-low_pc.c index d2d36f675e8b7..b89a6f99ebcb3 100644 --- a/llvm/test/tools/dsymutil/AArch64/inlined-low_pc.c +++ b/llvm/test/tools/dsymutil/AArch64/inlined-low_pc.c @@ -10,10 +10,10 @@ int bar(int a) { return foo(a); } // RUN: llvm-dwarfdump - | FileCheck %s // CHECK: DW_TAG_subprogram -// CHECK: DW_AT_low_pc{{.*}}0x0000000000010000 +// CHECK: DW_AT_low_pc{{.*}}0x0000000000010040 // CHECK: DW_AT_name{{.*}}"bar" // CHECK-NOT: NULL // CHECK: DW_TAG_inlined_subroutine // CHECK-NEXT: DW_AT_abstract_origin{{.*}}"foo" -// CHECK-NEXT: DW_AT_low_pc{{.*}}0x0000000000010000 +// CHECK-NEXT: DW_AT_low_pc{{.*}}0x0000000000010040 diff --git a/llvm/test/tools/dsymutil/AArch64/odr-uniquing-DW_AT_name-conflict.test b/llvm/test/tools/dsymutil/AArch64/odr-uniquing-DW_AT_name-conflict.test new file mode 100644 index 0000000000000..b6edb8bca3194 --- /dev/null +++ b/llvm/test/tools/dsymutil/AArch64/odr-uniquing-DW_AT_name-conflict.test @@ -0,0 +1,28 @@ +# Tests the case where a DW_TAG_subprogram for a method declaration +# got uniqued into a DW_TAG_subprogram with the same linkage name (but +# different DW_AT_name). Make sure the DW_TAG_subprogram DIE for the +# definition, which previously pointed to the now de-deduplicated declaration, +# gets inserted into the .debug_names table using the DW_AT_name of the canonical +# declaration DW_TAG_subprogram. 
+# +# Object files compiled as follows: +# clang -g -c -o 1.o Inputs/odr-uniquing-DW_AT_name-conflict/lib1.cpp +# clang -g -c -o 2.o Inputs/odr-uniquing-DW_AT_name-conflict/lib2.cpp + +# RUN: dsymutil -f -oso-prepend-path=%p/../Inputs/odr-uniquing-DW_AT_name-conflict -y %p/dummy-debug-map-arm64.map -o - \ +# RUN: | llvm-dwarfdump --verify - | FileCheck %s + +# RUN: dsymutil --linker parallel -f -oso-prepend-path=%p/../Inputs/odr-uniquing-DW_AT_name-conflict -y %p/dummy-debug-map-arm64.map -o - \ +# RUN: | not llvm-dwarfdump --verify - | FileCheck %s --check-prefix=PARALLEL-ODR + +# RUN: dsymutil -f -oso-prepend-path=%p/../Inputs/odr-uniquing-DW_AT_name-conflict -y %p/dummy-debug-map-arm64.map -no-odr -o - \ +# RUN: | llvm-dwarfdump --verify - | FileCheck %s + +# RUN: dsymutil --linker parallel -f -oso-prepend-path=%p/../Inputs/odr-uniquing-DW_AT_name-conflict -y %p/dummy-debug-map-arm64.map -no-odr -o - \ +# RUN: | llvm-dwarfdump --verify - | FileCheck %s + +# CHECK: No errors. + +# FIXME: parallel DWARFLinker uses wrong DW_AT_name when inserting uniqued subprogram into .debug_names +# PARALLEL-ODR: Verifying .debug_names... 
+# PARALLEL-ODR-NEXT: error: Name Index {{.*}} mismatched Name of DIE diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/1.o b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/1.o new file mode 100644 index 0000000000000..5932a3c89aaeb Binary files /dev/null and b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/1.o differ diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/2.o b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/2.o new file mode 100644 index 0000000000000..607b47059e982 Binary files /dev/null and b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/2.o differ diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.cpp b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.cpp new file mode 100644 index 0000000000000..4cf90f071ee8c --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.cpp @@ -0,0 +1,5 @@ +#include "lib1.h" + +[[gnu::weak]] void lib1_internal() { + Foo{}.func(); +} diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.h b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.h new file mode 100644 index 0000000000000..3b3cefbaeac17 --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib1.h @@ -0,0 +1,3 @@ +struct Foo { + template void func() {} +}; diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib2.cpp b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib2.cpp new file mode 100644 index 0000000000000..4cf90f071ee8c --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/lib2.cpp @@ -0,0 +1,5 @@ +#include "lib1.h" + +[[gnu::weak]] void lib1_internal() { + Foo{}.func(); +} diff --git a/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/main.cpp 
b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/main.cpp new file mode 100644 index 0000000000000..77f2cc4c8affe --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/odr-uniquing-DW_AT_name-conflict/main.cpp @@ -0,0 +1,6 @@ +[[gnu::weak]] void lib1_internal(); + +int main() { + lib1_internal(); + __builtin_debugtrap(); +} diff --git a/llvm/tools/bugpoint/BugDriver.h b/llvm/tools/bugpoint/BugDriver.h index ca57405f9d770..71a5aa14bbb2e 100644 --- a/llvm/tools/bugpoint/BugDriver.h +++ b/llvm/tools/bugpoint/BugDriver.h @@ -16,6 +16,7 @@ #define LLVM_TOOLS_BUGPOINT_BUGDRIVER_H #include "llvm/IR/ValueMap.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -41,6 +42,10 @@ extern bool DisableSimplifyCFG; /// extern bool BugpointIsInterrupted; +/// Command line options used across files. +extern cl::list InputArgv; +extern cl::opt OutputPrefix; + class BugDriver { LLVMContext &Context; const char *ToolName; // argv[0] of bugpoint diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp index 8c6b7fbe50c7c..96eeb35b4db70 100644 --- a/llvm/tools/bugpoint/ExecutionDriver.cpp +++ b/llvm/tools/bugpoint/ExecutionDriver.cpp @@ -13,7 +13,6 @@ #include "BugDriver.h" #include "ToolRunner.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Program.h" @@ -102,15 +101,13 @@ static cl::opt CustomExecCommand( // Anything specified after the --args option are taken as arguments to the // program being debugged. 
-namespace llvm { -cl::list InputArgv("args", cl::Positional, - cl::desc("..."), - cl::PositionalEatsArgs); - -cl::opt - OutputPrefix("output-prefix", cl::init("bugpoint"), - cl::desc("Prefix to use for outputs (default: 'bugpoint')")); -} // namespace llvm +cl::list llvm::InputArgv("args", cl::Positional, + cl::desc("..."), + cl::PositionalEatsArgs); + +cl::opt llvm::OutputPrefix( + "output-prefix", cl::init("bugpoint"), + cl::desc("Prefix to use for outputs (default: 'bugpoint')")); static cl::list ToolArgv("tool-args", cl::Positional, cl::desc("..."), diff --git a/llvm/tools/bugpoint/ExtractFunction.cpp b/llvm/tools/bugpoint/ExtractFunction.cpp index 3206589ff38f2..31cdd0d43f2fc 100644 --- a/llvm/tools/bugpoint/ExtractFunction.cpp +++ b/llvm/tools/bugpoint/ExtractFunction.cpp @@ -36,9 +36,6 @@ using namespace llvm; #define DEBUG_TYPE "bugpoint" bool llvm::DisableSimplifyCFG = false; -namespace llvm { -extern cl::opt OutputPrefix; -} // namespace llvm static cl::opt NoDCE("disable-dce", diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp index a7f1643aecf15..dcad126d87865 100644 --- a/llvm/tools/bugpoint/Miscompilation.cpp +++ b/llvm/tools/bugpoint/Miscompilation.cpp @@ -28,11 +28,6 @@ using namespace llvm; -namespace llvm { -extern cl::opt OutputPrefix; -extern cl::list InputArgv; -} // end namespace llvm - static cl::opt DisableLoopExtraction( "disable-loop-extraction", cl::desc("Don't extract loops when searching for miscompilations"), diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp index bf2e8c0b4a910..191f87c08a0f6 100644 --- a/llvm/tools/bugpoint/OptimizerDriver.cpp +++ b/llvm/tools/bugpoint/OptimizerDriver.cpp @@ -34,10 +34,6 @@ using namespace llvm; #define DEBUG_TYPE "bugpoint" -namespace llvm { -extern cl::opt OutputPrefix; -} - static cl::opt OptCmd("opt-command", cl::init(""), cl::desc("Path to opt. 
(default: search path " diff --git a/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp b/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp index cffaf7c9543fb..20ae253513f05 100644 --- a/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp +++ b/llvm/unittests/DebugInfo/PDB/NativeSessionTest.cpp @@ -40,6 +40,18 @@ TEST(NativeSessionTest, TestCreateFromExe) { ASSERT_THAT_ERROR(std::move(E), Succeeded()); } +TEST(NativeSessionTest, TestInvalidPdbMagicError) { + SmallString<128> InputsDir = unittest::getInputFileDirectory(TestMainArgv0); + llvm::sys::path::append(InputsDir, "SimpleTest.cpp"); + std::string CppPath{InputsDir}; + std::unique_ptr S; + + Error E = NativeSession::createFromPdbPath(CppPath, S); + const char *FormatErr = "The record is in an unexpected format. " + "The input file did not contain the pdb file magic."; + ASSERT_THAT_ERROR(std::move(E), FailedWithMessage(FormatErr)); +} + TEST(NativeSessionTest, TestSetLoadAddress) { std::unique_ptr S; Error E = pdb::loadDataForEXE(PDB_ReaderType::Native, getExePath(), S); diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn index 66dbf6152472a..9b524e2ef7cd5 100644 --- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn @@ -29,6 +29,8 @@ static_library("Driver") { sources = [ "Action.cpp", "Compilation.cpp", + "CreateASTUnitFromArgs.cpp", + "CreateInvocationFromArgs.cpp", "Distro.cpp", "Driver.cpp", "Job.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn index 4009cfc609f4a..cdf39d645bc52 100644 --- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn @@ -28,7 +28,6 @@ static_library("Frontend") { "ChainedIncludesSource.cpp", "CompilerInstance.cpp", "CompilerInvocation.cpp", - "CreateInvocationFromCommandLine.cpp", "DependencyFile.cpp", "DependencyGraph.cpp", 
"DiagnosticRenderer.cpp", @@ -48,6 +47,7 @@ static_library("Frontend") { "SARIFDiagnosticPrinter.cpp", "SerializedDiagnosticPrinter.cpp", "SerializedDiagnosticReader.cpp", + "StandaloneDiagnostic.cpp", "TestModuleFileExtension.cpp", "TextDiagnostic.cpp", "TextDiagnosticBuffer.cpp", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 09d2f1ed92554..82fe916645635 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1167,7 +1167,6 @@ if (current_toolchain == default_toolchain) { "__locale_dir/locale_base_api.h", "__locale_dir/locale_base_api/bsd_locale_fallbacks.h", "__locale_dir/locale_base_api/ibm.h", - "__locale_dir/locale_base_api/musl.h", "__locale_dir/locale_base_api/openbsd.h", "__locale_dir/messages.h", "__locale_dir/money.h", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index ad72c0069237d..e54797e188a11 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -165,6 +165,7 @@ static_library("LLVMRISCVCodeGen") { "RISCVVectorMaskDAGMutation.cpp", "RISCVVectorPeephole.cpp", "RISCVZacasABIFix.cpp", + "RISCVZilsdOptimizer.cpp", ] } diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h index c1ade9ed8617c..cc7f09f71d028 100644 --- a/mlir/include/mlir-c/Dialect/LLVM.h +++ b/mlir/include/mlir-c/Dialect/LLVM.h @@ -23,6 +23,8 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(LLVM, llvm); MLIR_CAPI_EXPORTED MlirType mlirLLVMPointerTypeGet(MlirContext ctx, unsigned addressSpace); +MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMPointerTypeGetTypeID(void); + /// Returns `true` if the type is an LLVM dialect pointer type. 
MLIR_CAPI_EXPORTED bool mlirTypeIsALLVMPointerType(MlirType type); @@ -58,6 +60,8 @@ MLIR_CAPI_EXPORTED MlirType mlirLLVMFunctionTypeGetReturnType(MlirType type); /// Returns `true` if the type is an LLVM dialect struct type. MLIR_CAPI_EXPORTED bool mlirTypeIsALLVMStructType(MlirType type); +MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMStructTypeGetTypeID(void); + /// Returns `true` if the type is a literal (unnamed) LLVM struct type. MLIR_CAPI_EXPORTED bool mlirLLVMStructTypeIsLiteral(MlirType type); diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 05d2316711c8a..601fc1a594768 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -177,6 +177,10 @@ static constexpr StringLiteral getRoutineInfoAttrName() { return StringLiteral("acc.routine_info"); } +static constexpr StringLiteral getFromDefaultClauseAttrName() { + return StringLiteral("acc.from_default"); +} + static constexpr StringLiteral getVarNameAttrName() { return VarNameAttr::name; } diff --git a/mlir/lib/Bindings/Python/DialectLLVM.cpp b/mlir/lib/Bindings/Python/DialectLLVM.cpp index 870a713b8edcb..05681cecf82b3 100644 --- a/mlir/lib/Bindings/Python/DialectLLVM.cpp +++ b/mlir/lib/Bindings/Python/DialectLLVM.cpp @@ -31,8 +31,8 @@ static void populateDialectLLVMSubmodule(nanobind::module_ &m) { // StructType //===--------------------------------------------------------------------===// - auto llvmStructType = - mlir_type_subclass(m, "StructType", mlirTypeIsALLVMStructType); + auto llvmStructType = mlir_type_subclass( + m, "StructType", mlirTypeIsALLVMStructType, mlirLLVMStructTypeGetTypeID); llvmStructType .def_classmethod( @@ -137,7 +137,8 @@ static void populateDialectLLVMSubmodule(nanobind::module_ &m) { // PointerType //===--------------------------------------------------------------------===// - mlir_type_subclass(m, "PointerType", mlirTypeIsALLVMPointerType) + mlir_type_subclass(m, 
"PointerType", mlirTypeIsALLVMPointerType, + mlirLLVMPointerTypeGetTypeID) .def_classmethod( "get", [](const nb::object &cls, std::optional addressSpace, diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp index 6636f0ea73ec9..bf231767320a5 100644 --- a/mlir/lib/CAPI/Dialect/LLVM.cpp +++ b/mlir/lib/CAPI/Dialect/LLVM.cpp @@ -27,6 +27,10 @@ MlirType mlirLLVMPointerTypeGet(MlirContext ctx, unsigned addressSpace) { return wrap(LLVMPointerType::get(unwrap(ctx), addressSpace)); } +MlirTypeID mlirLLVMPointerTypeGetTypeID() { + return wrap(LLVM::LLVMPointerType::getTypeID()); +} + bool mlirTypeIsALLVMPointerType(MlirType type) { return isa(unwrap(type)); } @@ -73,6 +77,10 @@ bool mlirTypeIsALLVMStructType(MlirType type) { return isa(unwrap(type)); } +MlirTypeID mlirLLVMStructTypeGetTypeID() { + return wrap(LLVM::LLVMStructType::getTypeID()); +} + bool mlirLLVMStructTypeIsLiteral(MlirType type) { return !cast(unwrap(type)).isIdentified(); } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index ce93d18f56d39..5dc4fa2b2d82f 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -667,6 +667,7 @@ LogicalResult LLVMStructType::verifyEntries(DataLayoutEntryListRef entries, static constexpr llvm::StringRef kSpirvPrefix = "spirv."; static constexpr llvm::StringRef kArmSVCount = "aarch64.svcount"; +static constexpr llvm::StringRef kAMDGCNNamedBarrier = "amdgcn.named.barrier"; bool LLVM::LLVMTargetExtType::hasProperty(Property prop) const { // See llvm/lib/IR/Type.cpp for reference. 
@@ -676,6 +677,9 @@ bool LLVM::LLVMTargetExtType::hasProperty(Property prop) const { properties |= (LLVMTargetExtType::HasZeroInit | LLVM::LLVMTargetExtType::CanBeGlobal); + if (getExtTypeName() == kAMDGCNNamedBarrier) + properties |= LLVMTargetExtType::CanBeGlobal; + return (properties & prop) == prop; } diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp index 91262bd76ca31..7d729619b3f21 100644 --- a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp +++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitData.cpp @@ -570,6 +570,8 @@ Operation *ACCImplicitData::generateDataClauseOpForCandidate( newDataOp = acc::PresentOp::create(builder, loc, var, /*structured=*/true, /*implicit=*/true, accSupport.getVariableName(var)); + newDataOp->setAttr(acc::getFromDefaultClauseAttrName(), + builder.getUnitAttr()); } else { auto copyinOp = acc::CopyinOp::create(builder, loc, var, diff --git a/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir b/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir index cf09c33ca5197..06c1c3cadd4ba 100644 --- a/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir +++ b/mlir/test/Dialect/OpenACC/acc-implicit-data.mlir @@ -110,7 +110,7 @@ func.func @test_array_parallel_defaultpresent() { } // CHECK-LABEL: func.func @test_array_parallel_defaultpresent -// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : memref<10xf32>) -> memref<10xf32> {implicit = true, name = ""} +// CHECK: %[[PRESENT:.*]] = acc.present varPtr({{.*}} : memref<10xf32>) -> memref<10xf32> {acc.from_default, implicit = true, name = ""} // CHECK: acc.delete accPtr(%[[PRESENT]] : memref<10xf32>) {dataClause = #acc, implicit = true, name = ""} // ----- diff --git a/mlir/test/Target/LLVMIR/target-ext-type.mlir b/mlir/test/Target/LLVMIR/target-ext-type.mlir index 6b2d2ea3d4c23..cee630163ca21 100644 --- a/mlir/test/Target/LLVMIR/target-ext-type.mlir +++ b/mlir/test/Target/LLVMIR/target-ext-type.mlir @@ -6,6 +6,12 @@ 
llvm.mlir.global external @global() {addr_space = 0 : i32} : !llvm.target<"spirv llvm.return %0 : !llvm.target<"spirv.DeviceEvent"> } +// CHECK: @amdgcn_named_barrier = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison +llvm.mlir.global internal @amdgcn_named_barrier() {addr_space = 3 : i32} : !llvm.target<"amdgcn.named.barrier", 0> { + %0 = llvm.mlir.poison : !llvm.target<"amdgcn.named.barrier", 0> + llvm.return %0 : !llvm.target<"amdgcn.named.barrier", 0> +} + // CHECK-LABEL: define target("spirv.Event") @func2() { // CHECK-NEXT: %1 = alloca target("spirv.Event"), align 8 // CHECK-NEXT: %2 = load target("spirv.Event"), ptr %1, align 8 diff --git a/mlir/test/python/dialects/llvm.py b/mlir/test/python/dialects/llvm.py index 8ea0fddee3f7c..305ed9aba940d 100644 --- a/mlir/test/python/dialects/llvm.py +++ b/mlir/test/python/dialects/llvm.py @@ -98,6 +98,9 @@ def testStructType(): assert opaque.opaque # CHECK: !llvm.struct<"opaque", opaque> + typ = Type.parse('!llvm.struct<"zoo", (i32, i64)>') + assert isinstance(typ, llvm.StructType) + # CHECK-LABEL: testSmoke @constructAndPrintInModule @@ -120,6 +123,9 @@ def testPointerType(): # CHECK: !llvm.ptr<1> print(ptr_with_addr) + typ = Type.parse("!llvm.ptr<1>") + assert isinstance(typ, llvm.PointerType) + # CHECK-LABEL: testConstant @constructAndPrintInModule