diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py
index 6785e82f3440b..08387de817467 100644
--- a/.ci/generate_test_report_github.py
+++ b/.ci/generate_test_report_github.py
@@ -8,6 +8,7 @@
 
 import generate_test_report_lib
 
+
 def compute_platform_title() -> str:
     logo = ":window:" if platform.system() == "Windows" else ":penguin:"
     # On Linux the machine value is x86_64 on Windows it is AMD64.
diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 7820fbda803d7..0c025c561f6f7 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -100,6 +100,7 @@ def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]:
         )
     return output
 
+
 def get_failures(junit_objects) -> dict[str, list[tuple[str, str]]]:
     failures = {}
     for results in junit_objects:
diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index 4b827b647b06c..f872db2cae0ce 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -1018,41 +1018,15 @@ void Relocation::print(raw_ostream &OS) const {
   default:
     OS << "RType:" << Twine::utohexstr(Type);
     break;
-
-  case Triple::aarch64: {
-    static const char *const AArch64RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(AArch64RelocNames).size());
-    OS << AArch64RelocNames[Type];
-  } break;
-
+  case Triple::aarch64:
+    OS << object::getELFRelocationTypeName(ELF::EM_AARCH64, Type);
+    break;
   case Triple::riscv64:
-    // RISC-V relocations are not sequentially numbered so we cannot use an
-    // array
-    switch (Type) {
-    default:
-      llvm_unreachable("illegal RISC-V relocation");
-#define ELF_RELOC(name, value)                                                 \
-  case value:                                                                  \
-    OS << #name;                                                               \
+    OS << object::getELFRelocationTypeName(ELF::EM_RISCV, Type);
     break;
-#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
-#undef ELF_RELOC
-    }
+  case Triple::x86_64:
+    OS << object::getELFRelocationTypeName(ELF::EM_X86_64, Type);
     break;
-
-  case Triple::x86_64: {
-    static const char *const X86RelocNames[] = {
-#define ELF_RELOC(name, value) #name,
-#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
-#undef ELF_RELOC
-    };
-    assert(Type < ArrayRef(X86RelocNames).size());
-    OS << X86RelocNames[Type];
-  } break;
   }
   OS << ", 0x" << Twine::utohexstr(Offset);
   if (Symbol) {
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 57db6a436c5c6..3c77091d91ebd 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -640,7 +640,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     Insts[1].addOperand(MCOperand::createImm(0));
     Insts[1].addOperand(MCOperand::createImm(0));
     setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx,
-                          ELF::R_AARCH64_ADD_ABS_LO12_NC);
+                          isLDRXl(LDRInst) ? ELF::R_AARCH64_LDST64_ABS_LO12_NC
+                                           : ELF::R_AARCH64_LDST32_ABS_LO12_NC);
     return Insts;
   }
 
diff --git a/bolt/test/AArch64/relocation-type-print.s b/bolt/test/AArch64/relocation-type-print.s
new file mode 100644
index 0000000000000..111cbbb94bc54
--- /dev/null
+++ b/bolt/test/AArch64/relocation-type-print.s
@@ -0,0 +1,24 @@
+## Verify that llvm-bolt correctly prints relocation types.
+
+# REQUIRES: system-linux
+
+# RUN: %clang %cflags -nostartfiles %s -o %t.exe -Wl,-q,--no-relax
+# RUN: llvm-bolt %t.exe --print-cfg --print-relocations -o %t.bolt \
+# RUN:   | FileCheck %s
+
+  .section .text
+  .align 4
+  .globl _start
+  .type _start, %function
+_start:
+
+  adrp x0, _start
+# CHECK: adrp
+# CHECK-SAME: R_AARCH64_ADR_PREL_PG_HI21
+
+  add x0, x0, :lo12:_start
+# CHECK-NEXT: add
+# CHECK-SAME: R_AARCH64_ADD_ABS_LO12_NC
+
+  ret
+  .size _start, .-_start
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
index badff299603a0..75066c855b9ed 100644
--- a/bolt/test/runtime/AArch64/inline-memcpy.s
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -81,14 +81,14 @@
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # Register move should NOT be inlined (size unknown at compile time)
-# CHECK-ASM-LABEL: <test_register_move_negative>:
+# CHECK-ASM-LABEL: <test_register_move_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
-# CHECK-ASM-LABEL: <test_x2_rewrite_negative>:
+# CHECK-ASM-LABEL: <test_x2_rewrite_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # Live-in parameter should NOT be inlined (size unknown at compile time)
-# CHECK-ASM-LABEL: <test_live_in_negative>:
+# CHECK-ASM-LABEL: <test_live_in_unknown>:
 # CHECK-ASM: bl{{.*}}<memcpy
 
 # _memcpy8 should be inlined with end-pointer return (dest+size)
@@ -262,9 +262,9 @@ test_4_byte_add_immediate:
 	ret
 	.size	test_4_byte_add_immediate, .-test_4_byte_add_immediate
 
-	.globl	test_register_move_negative
-	.type	test_register_move_negative,@function
-test_register_move_negative:
+	.globl	test_register_move_unknown
+	.type	test_register_move_unknown,@function
+test_register_move_unknown:
 	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
 	add	x1, sp, #16
@@ -274,20 +274,20 @@ test_register_move_negative:
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
-	.size	test_register_move_negative, .-test_register_move_negative
+	.size	test_register_move_unknown, .-test_register_move_unknown
 
-	.globl  test_x2_rewrite_negative
-	.type   test_x2_rewrite_negative,@function
-test_x2_rewrite_negative:
+	.globl  test_x2_rewrite_unknown
+	.type   test_x2_rewrite_unknown,@function
+test_x2_rewrite_unknown:
 	mov     x2, #8
 	ldr     x2, [sp, #24]
 	bl      memcpy
 	ret
-	.size   test_x2_rewrite_negative, .-test_x2_rewrite_negative
+	.size   test_x2_rewrite_unknown, .-test_x2_rewrite_unknown
 
-	.globl	test_live_in_negative
-	.type	test_live_in_negative,@function
-test_live_in_negative:
+	.globl	test_live_in_unknown
+	.type	test_live_in_unknown,@function
+test_live_in_unknown:
 	# x2 comes in as parameter, no instruction sets it (should NOT inline)
 	stp	x29, x30, [sp, #-32]!
 	mov	x29, sp
@@ -297,7 +297,7 @@ test_live_in_negative:
 	bl	memcpy
 	ldp	x29, x30, [sp], #32
 	ret
-	.size	test_live_in_negative, .-test_live_in_negative
+	.size	test_live_in_unknown, .-test_live_in_unknown
 
 	.globl	test_memcpy8_4_byte
 	.type	test_memcpy8_4_byte,@function
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2fdd041c1b46e..1be9a96aa44de 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3450,9 +3450,9 @@ Mac, and BSD. This attribute has no effect on other targets.
 def MSABIDocs : Documentation {
   let Category = DocCatCallingConvs;
   let Content = [{
-On non-Windows x86_64 targets, this attribute changes the calling convention of
-a function to match the default convention used on Windows x86_64. This
-attribute has no effect on Windows targets or non-x86_64 targets.
+On non-Windows x86_64 and aarch64 targets, this attribute changes the calling
+convention of a function to match the default convention used on Windows. This
+attribute has no effect on Windows targets or non-x86_64, non-aarch64 targets.
   }];
 }
 
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fbc227af33575..654e09c753109 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -180,7 +180,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")
 BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_add_i32, "iiQbiiIi", "")
 
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32, "ffQbiiIi", "", "atomic-fadd-rtn-insts")
-TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "t", "atomic-buffer-global-pk-add-f16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16, "V2hV2hQbiiIi", "", "atomic-buffer-global-pk-add-f16-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmin_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
 TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f32, "ffQbiiIi", "", "atomic-fmin-fmax-global-f32")
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index e4044bcdfcc60..b27492d19a65b 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -46,6 +46,57 @@ class LangOptions;
 class MultiKeywordSelector;
 class SourceLocation;
 
+/// Keyword flag bits used in TokenKinds.def; they combine as a bitmask.
+enum TokenKey : unsigned {
+  KEYC99 = 0x1,
+  KEYCXX = 0x2,
+  KEYCXX11 = 0x4,
+  KEYGNU = 0x8,
+  KEYMS = 0x10,
+  BOOLSUPPORT = 0x20,
+  KEYALTIVEC = 0x40,
+  KEYNOCXX = 0x80,
+  KEYBORLAND = 0x100,
+  KEYOPENCLC = 0x200,
+  KEYC23 = 0x400,
+  KEYNOMS18 = 0x800,
+  KEYNOOPENCL = 0x1000,
+  WCHARSUPPORT = 0x2000,
+  HALFSUPPORT = 0x4000,
+  CHAR8SUPPORT = 0x8000,
+  KEYOBJC = 0x10000,
+  KEYZVECTOR = 0x20000,
+  KEYCOROUTINES = 0x40000,
+  KEYMODULES = 0x80000,
+  KEYCXX20 = 0x100000,
+  KEYOPENCLCXX = 0x200000,
+  KEYMSCOMPAT = 0x400000,
+  KEYSYCL = 0x800000,
+  KEYCUDA = 0x1000000,
+  KEYZOS = 0x2000000,
+  KEYNOZOS = 0x4000000,
+  KEYHLSL = 0x8000000,
+  KEYFIXEDPOINT = 0x10000000,
+  KEYMAX = KEYFIXEDPOINT, // The maximum key
+  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
+  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
+           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
+};
+
+/// How a keyword is treated in the selected standard. This enum is ordered
+/// intentionally so that the value that 'wins' is the most 'permissive'.
+enum KeywordStatus {
+  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
+  KS_Disabled,  // Disabled
+  KS_Future,    // Is a keyword in future standard
+  KS_Extension, // Is an extension
+  KS_Enabled,   // Enabled
+};
+
+/// Translates flags as specified in TokenKinds.def into keyword status
+/// in the given language standard.
+KeywordStatus getKeywordStatus(const LangOptions &LangOpts, unsigned Flags);
+
 enum class ReservedIdentifierStatus {
   NotReserved = 0,
   StartsWithUnderscoreAtGlobalScope,
diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h
index 6d9d074d78026..bc9e97863556d 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -1409,10 +1409,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// before calling this method.
   unsigned getColumnNumber(FileID FID, unsigned FilePos,
                            bool *Invalid = nullptr) const;
+  unsigned getColumnNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
   unsigned getSpellingColumnNumber(SourceLocation Loc,
-                                   bool *Invalid = nullptr) const;
+                                   bool *Invalid = nullptr) const {
+    return getColumnNumber(getSpellingLoc(Loc), Invalid);
+  }
   unsigned getExpansionColumnNumber(SourceLocation Loc,
-                                    bool *Invalid = nullptr) const;
+                                    bool *Invalid = nullptr) const {
+    return getColumnNumber(getExpansionLoc(Loc), Invalid);
+  }
   unsigned getPresumedColumnNumber(SourceLocation Loc,
                                    bool *Invalid = nullptr) const;
 
@@ -1423,8 +1428,15 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// MemoryBuffer, so this is not cheap: use only when about to emit a
   /// diagnostic.
   unsigned getLineNumber(FileID FID, unsigned FilePos, bool *Invalid = nullptr) const;
-  unsigned getSpellingLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
-  unsigned getExpansionLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
+  unsigned getLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
+  unsigned getSpellingLineNumber(SourceLocation Loc,
+                                 bool *Invalid = nullptr) const {
+    return getLineNumber(getSpellingLoc(Loc), Invalid);
+  }
+  unsigned getExpansionLineNumber(SourceLocation Loc,
+                                  bool *Invalid = nullptr) const {
+    return getLineNumber(getExpansionLoc(Loc), Invalid);
+  }
   unsigned getPresumedLineNumber(SourceLocation Loc, bool *Invalid = nullptr) const;
 
   /// Return the filename or buffer identifier of the buffer the
diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp
index 4a2b77cd16bfc..d1c959b9687c4 100644
--- a/clang/lib/Basic/IdentifierTable.cpp
+++ b/clang/lib/Basic/IdentifierTable.cpp
@@ -77,57 +77,6 @@ IdentifierTable::IdentifierTable(const LangOptions &LangOpts,
 // Language Keyword Implementation
 //===----------------------------------------------------------------------===//
 
-// Constants for TokenKinds.def
-namespace {
-
-enum TokenKey : unsigned {
-  KEYC99 = 0x1,
-  KEYCXX = 0x2,
-  KEYCXX11 = 0x4,
-  KEYGNU = 0x8,
-  KEYMS = 0x10,
-  BOOLSUPPORT = 0x20,
-  KEYALTIVEC = 0x40,
-  KEYNOCXX = 0x80,
-  KEYBORLAND = 0x100,
-  KEYOPENCLC = 0x200,
-  KEYC23 = 0x400,
-  KEYNOMS18 = 0x800,
-  KEYNOOPENCL = 0x1000,
-  WCHARSUPPORT = 0x2000,
-  HALFSUPPORT = 0x4000,
-  CHAR8SUPPORT = 0x8000,
-  KEYOBJC = 0x10000,
-  KEYZVECTOR = 0x20000,
-  KEYCOROUTINES = 0x40000,
-  KEYMODULES = 0x80000,
-  KEYCXX20 = 0x100000,
-  KEYOPENCLCXX = 0x200000,
-  KEYMSCOMPAT = 0x400000,
-  KEYSYCL = 0x800000,
-  KEYCUDA = 0x1000000,
-  KEYZOS = 0x2000000,
-  KEYNOZOS = 0x4000000,
-  KEYHLSL = 0x8000000,
-  KEYFIXEDPOINT = 0x10000000,
-  KEYMAX = KEYFIXEDPOINT, // The maximum key
-  KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20,
-  KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL &
-           ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded.
-};
-
-/// How a keyword is treated in the selected standard. This enum is ordered
-/// intentionally so that the value that 'wins' is the most 'permissive'.
-enum KeywordStatus {
-  KS_Unknown,   // Not yet calculated. Used when figuring out the status.
-  KS_Disabled,  // Disabled
-  KS_Future,    // Is a keyword in future standard
-  KS_Extension, // Is an extension
-  KS_Enabled,   // Enabled
-};
-
-} // namespace
-
 // This works on a single TokenKey flag and checks the LangOpts to get the
 // KeywordStatus based exclusively on this flag, so that it can be merged in
 // getKeywordStatus. Most should be enabled/disabled, but some might imply
@@ -220,9 +169,7 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts,
   }
 }
 
-/// Translates flags as specified in TokenKinds.def into keyword status
-/// in the given language standard.
-static KeywordStatus getKeywordStatus(const LangOptions &LangOpts,
+KeywordStatus clang::getKeywordStatus(const LangOptions &LangOpts,
                                       unsigned Flags) {
   // KEYALL means always enabled, so special case this one.
   if (Flags == KEYALL) return KS_Enabled;
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 7dc81c50f87a2..b6cc6ec9365f5 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -1159,17 +1159,11 @@ static bool isInvalid(LocType Loc, bool *Invalid) {
   return MyInvalid;
 }
 
-unsigned SourceManager::getSpellingColumnNumber(SourceLocation Loc,
-                                                bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
-  return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
-}
-
-unsigned SourceManager::getExpansionColumnNumber(SourceLocation Loc,
-                                                 bool *Invalid) const {
+unsigned SourceManager::getColumnNumber(SourceLocation Loc,
+                                        bool *Invalid) const {
+  assert(Loc.isFileID());
   if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
+  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
   return getColumnNumber(LocInfo.first, LocInfo.second, Invalid);
 }
 
@@ -1367,18 +1361,13 @@ unsigned SourceManager::getLineNumber(FileID FID, unsigned FilePos,
   return LineNo;
 }
 
-unsigned SourceManager::getSpellingLineNumber(SourceLocation Loc,
-                                              bool *Invalid) const {
-  if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedSpellingLoc(Loc);
-  return getLineNumber(LocInfo.first, LocInfo.second);
-}
-unsigned SourceManager::getExpansionLineNumber(SourceLocation Loc,
-                                               bool *Invalid) const {
+unsigned SourceManager::getLineNumber(SourceLocation Loc, bool *Invalid) const {
+  assert(Loc.isFileID());
   if (isInvalid(Loc, Invalid)) return 0;
-  FileIDAndOffset LocInfo = getDecomposedExpansionLoc(Loc);
+  FileIDAndOffset LocInfo = getDecomposedLoc(Loc);
   return getLineNumber(LocInfo.first, LocInfo.second);
 }
+
 unsigned SourceManager::getPresumedLineNumber(SourceLocation Loc,
                                               bool *Invalid) const {
   PresumedLoc PLoc = getPresumedLoc(Loc);
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index ac9c81d4416c9..d31d656a63fc5 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -285,7 +285,8 @@ class LineJoiner {
       if (Tok && Tok->is(tok::kw_typedef))
         Tok = Tok->getNextNonComment();
       if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union,
-                              tok::kw_extern, Keywords.kw_interface)) {
+                              tok::kw_extern, Keywords.kw_interface,
+                              Keywords.kw_record)) {
         return !Style.BraceWrapping.SplitEmptyRecord && EmptyBlock
                    ? tryMergeSimpleBlock(I, E, Limit)
                    : 0;
@@ -498,7 +499,8 @@ class LineJoiner {
         ShouldMerge = Style.AllowShortEnumsOnASingleLine;
       } else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
         ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
-      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace)) {
+      } else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
+                                        TT_RecordLBrace)) {
         // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
         // and structs, but it seems that wrapping is still handled correctly
         // elsewhere.
@@ -507,7 +509,7 @@ class LineJoiner {
                        !Style.BraceWrapping.SplitEmptyRecord);
       } else if (TheLine->InPPDirective ||
                  TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum,
-                                          tok::kw_struct)) {
+                                          tok::kw_struct, Keywords.kw_record)) {
         // Try to merge a block with left brace unwrapped that wasn't yet
         // covered.
         ShouldMerge = !Style.BraceWrapping.AfterFunction ||
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 5e2584edac8f4..8b7dd02d548af 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -948,7 +948,11 @@ static bool isIIFE(const UnwrappedLine &Line,
 }
 
 static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
-                                   const FormatToken &InitialToken) {
+                                   const FormatToken &InitialToken,
+                                   const bool IsJavaRecord) {
+  if (IsJavaRecord)
+    return Style.BraceWrapping.AfterClass;
+
   tok::TokenKind Kind = InitialToken.Tok.getKind();
   if (InitialToken.is(TT_NamespaceMacro))
     Kind = tok::kw_namespace;
@@ -3200,7 +3204,7 @@ void UnwrappedLineParser::parseNamespace() {
   if (FormatTok->is(tok::l_brace)) {
     FormatTok->setFinalizedType(TT_NamespaceLBrace);
 
-    if (ShouldBreakBeforeBrace(Style, InitialToken))
+    if (ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false))
       addUnwrappedLine();
 
     unsigned AddLevels =
@@ -3865,7 +3869,7 @@ bool UnwrappedLineParser::parseEnum() {
   }
 
   if (!Style.AllowShortEnumsOnASingleLine &&
-      ShouldBreakBeforeBrace(Style, InitialToken)) {
+      ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false)) {
     addUnwrappedLine();
   }
   // Parse enum body.
@@ -4160,7 +4164,7 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr, bool IsJavaRecord) {
     if (ParseAsExpr) {
       parseChildBlock();
     } else {
-      if (ShouldBreakBeforeBrace(Style, InitialToken))
+      if (ShouldBreakBeforeBrace(Style, InitialToken, IsJavaRecord))
         addUnwrappedLine();
 
       unsigned AddLevels = Style.IndentAccessModifiers ? 2u : 1u;
diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp
index 4b63eb7df1054..12cc02965e7d3 100644
--- a/clang/lib/Sema/SemaFunctionEffects.cpp
+++ b/clang/lib/Sema/SemaFunctionEffects.cpp
@@ -1302,6 +1302,14 @@ class Analyzer {
       return true;
     }
 
+    bool TraverseCXXRecordDecl(CXXRecordDecl *D) override {
+      // Completely skip local struct/class/union declarations since their
+      // methods would otherwise be incorrectly interpreted as part of the
+      // function we are currently traversing. The initial Sema pass will have
+      // already recorded any nonblocking methods needing analysis.
+      return true;
+    }
+
     bool TraverseConstructorInitializer(CXXCtorInitializer *Init) override {
       ViolationSite PrevVS = VSite;
       if (Init->isAnyMemberInitializer())
diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp
index 881e816292d59..012c017798a1f 100644
--- a/clang/test/Sema/attr-nonblocking-constraints.cpp
+++ b/clang/test/Sema/attr-nonblocking-constraints.cpp
@@ -104,6 +104,25 @@ void nb8c()
 	};
 }
 
+void nb8d() [[clang::nonblocking]]
+{
+	// Blocking methods of a local CXXRecordDecl do not generate diagnostics
+	// for the outer function.
+	struct F1 {
+        void method() { void* ptr = new int; }
+	};
+
+	// Skipping the CXXRecordDecl does not skip a following VarDecl.
+	struct F2 {
+        F2() { void* ptr = new int; } // expected-note {{constructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}}
+	} f2; // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' constructor 'nb8d()::F2::F2'}}
+
+	// Nonblocking methods of a local CXXRecordDecl are verified independently.
+	struct F3 {
+		void method() [[clang::nonblocking]] { void* ptr = new int; }// expected-warning {{function with 'nonblocking' attribute must not allocate or deallocate memory}}
+	};
+}
+
 // Make sure template expansions are found and verified.
 	template <typename T>
 	struct Adder {
diff --git a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
index 8ee64d486f4f4..fea86162c801d 100644
--- a/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
+++ b/clang/test/SemaHIP/builtins-amdgcn-raw-buffer-atomic-add.hip
@@ -14,5 +14,9 @@ __device__ void test_raw_ptr_atomics(__amdgpu_buffer_rsrc_t rsrc, int i32, float
 __device__ void test_raw_ptr_atomics_err(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
   i32 = __builtin_amdgcn_raw_ptr_buffer_atomic_add_i32(i32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
   f32 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_f32(f32, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
-  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4);
+  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0, 4); // expected-error{{too many arguments to function call}}
+}
+
+__device__ void test_raw_ptr_atomics_f16_retty(__amdgpu_buffer_rsrc_t rsrc, int i32, float f32, float16x2_t v2f16, int offset, int soffset) {
+  v2f16 = __builtin_amdgcn_raw_ptr_buffer_atomic_fadd_v2f16(v2f16, rsrc, offset, soffset, 0);
 }
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 1416614bae29a..3cc97e2dc0b2e 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -848,6 +848,19 @@ TEST_F(FormatTestJava, TextBlock) {
                  "              Pat Q. Smith");
 }
 
+TEST_F(FormatTestJava, BreakAfterRecord) {
+  auto Style = getLLVMStyle(FormatStyle::LK_Java);
+  Style.EmptyLineBeforeAccessModifier = FormatStyle::ELBAMS_Never;
+  Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+  Style.BraceWrapping.AfterClass = true;
+  Style.BraceWrapping.SplitEmptyRecord = true;
+
+  verifyFormat("public record Foo(int i)\n"
+               "{\n"
+               "}",
+               "public record Foo(int i) {}", Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index ef58da61e371b..bb55a8163d938 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -348,6 +348,7 @@ struct NodeVisitor {
   READ_FEATURE(TeamValue)
   READ_FEATURE(ImageSelector)
   READ_FEATURE(ImageSelectorSpec)
+  READ_FEATURE(ImageSelectorSpec::Notify)
   READ_FEATURE(ImageSelectorSpec::Stat)
   READ_FEATURE(ImageSelectorSpec::Team_Number)
   READ_FEATURE(ImplicitPart)
diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h
index 48aafa8982559..d63c16f93230a 100644
--- a/flang/include/flang/Evaluate/traverse.h
+++ b/flang/include/flang/Evaluate/traverse.h
@@ -146,7 +146,7 @@ class Traverse {
     return Combine(x.base(), x.subscript());
   }
   Result operator()(const CoarrayRef &x) const {
-    return Combine(x.base(), x.cosubscript(), x.stat(), x.team());
+    return Combine(x.base(), x.cosubscript(), x.notify(), x.stat(), x.team());
   }
   Result operator()(const DataRef &x) const { return visitor_(x.u); }
   Result operator()(const Substring &x) const {
diff --git a/flang/include/flang/Evaluate/variable.h b/flang/include/flang/Evaluate/variable.h
index 5c14421fd3a1b..4f64ede3d407d 100644
--- a/flang/include/flang/Evaluate/variable.h
+++ b/flang/include/flang/Evaluate/variable.h
@@ -260,6 +260,9 @@ class CoarrayRef {
   // it's TEAM=.
   std::optional<Expr<SomeType>> team() const;
   CoarrayRef &set_team(Expr<SomeType> &&);
+  // When notify() has a value, it is the NOTIFY= notify-variable.
+  std::optional<Expr<SomeType>> notify() const;
+  CoarrayRef &set_notify(Expr<SomeType> &&);
 
   int Rank() const;
   int Corank() const { return 0; }
@@ -272,6 +275,7 @@ class CoarrayRef {
 private:
   common::CopyableIndirection<DataRef> base_;
   std::vector<Expr<SubscriptInteger>> cosubscript_;
+  std::optional<common::CopyableIndirection<Expr<SomeType>>> notify_;
   std::optional<common::CopyableIndirection<Expr<SomeInteger>>> stat_;
   std::optional<common::CopyableIndirection<Expr<SomeType>>> team_;
 };
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index de2716410d6cd..b2424023b0168 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -387,6 +387,7 @@ class ParseTreeDumper {
   NODE(parser, TeamValue)
   NODE(parser, ImageSelector)
   NODE(parser, ImageSelectorSpec)
+  NODE(ImageSelectorSpec, Notify)
   NODE(ImageSelectorSpec, Stat)
   NODE(ImageSelectorSpec, Team_Number)
   NODE(parser, ImplicitPart)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 8c7578f7a1941..32e444fbb2e6c 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -1684,13 +1684,15 @@ using Cosubscript = ScalarIntExpr;
 WRAPPER_CLASS(TeamValue, Scalar<common::Indirection<Expr>>);
 
 // R926 image-selector-spec ->
+//        NOTIFY = notify-variable |
 //        STAT = stat-variable | TEAM = team-value |
 //        TEAM_NUMBER = scalar-int-expr
 struct ImageSelectorSpec {
   WRAPPER_CLASS(Stat, Scalar<Integer<common::Indirection<Variable>>>);
   WRAPPER_CLASS(Team_Number, ScalarIntExpr);
+  WRAPPER_CLASS(Notify, Scalar<common::Indirection<Variable>>);
   UNION_CLASS_BOILERPLATE(ImageSelectorSpec);
-  std::variant<Stat, TeamValue, Team_Number> u;
+  std::variant<Notify, Stat, TeamValue, Team_Number> u;
 };
 
 // R924 image-selector ->
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 8a7b9867c0979..1c3477013b559 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -107,6 +107,7 @@ bool IsBindCProcedure(const Scope &);
 // Returns a pointer to the function's symbol when true, else null
 const Symbol *IsFunctionResultWithSameNameAsFunction(const Symbol &);
 bool IsOrContainsEventOrLockComponent(const Symbol &);
+bool IsOrContainsNotifyComponent(const Symbol &);
 bool CanBeTypeBoundProc(const Symbol &);
 // Does a non-PARAMETER symbol have explicit initialization with =value or
 // =>target in its declaration (but not in a DATA statement)? (Being
@@ -652,6 +653,8 @@ using PotentialAndPointerComponentIterator =
 // dereferenced.
 PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent(
     const DerivedTypeSpec &, bool ignoreCoarrays = false);
+PotentialComponentIterator::const_iterator FindNotifyPotentialComponent(
+    const DerivedTypeSpec &, bool ignoreCoarrays = false);
 PotentialComponentIterator::const_iterator FindCoarrayPotentialComponent(
     const DerivedTypeSpec &);
 PotentialAndPointerComponentIterator::const_iterator
diff --git a/flang/lib/Evaluate/variable.cpp b/flang/lib/Evaluate/variable.cpp
index b9b34d4d5bc89..b257dad42fc58 100644
--- a/flang/lib/Evaluate/variable.cpp
+++ b/flang/lib/Evaluate/variable.cpp
@@ -89,6 +89,14 @@ std::optional<Expr<SomeType>> CoarrayRef::team() const {
   }
 }
 
+std::optional<Expr<SomeType>> CoarrayRef::notify() const {
+  if (notify_) {
+    return notify_.value().value();
+  } else {
+    return std::nullopt;
+  }
+}
+
 CoarrayRef &CoarrayRef::set_stat(Expr<SomeInteger> &&v) {
   CHECK(IsVariable(v));
   stat_.emplace(std::move(v));
@@ -100,6 +108,11 @@ CoarrayRef &CoarrayRef::set_team(Expr<SomeType> &&v) {
   return *this;
 }
 
+CoarrayRef &CoarrayRef::set_notify(Expr<SomeType> &&v) {
+  notify_.emplace(std::move(v));
+  return *this;
+}
+
 const Symbol &CoarrayRef::GetFirstSymbol() const {
   return base().GetFirstSymbol();
 }
diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp
index cb3090df25680..605264dfcbe85 100644
--- a/flang/lib/Lower/Support/Utils.cpp
+++ b/flang/lib/Lower/Support/Utils.cpp
@@ -84,7 +84,7 @@ class HashEvaluateExpr {
          x.cosubscript())
       cosubs -= getHashValue(v);
     return getHashValue(x.base()) * 97u - cosubs + getHashValue(x.stat()) +
-           257u + getHashValue(x.team());
+           257u + getHashValue(x.team()) + getHashValue(x.notify());
   }
   static unsigned getHashValue(const Fortran::evaluate::NamedEntity &x) {
     if (x.IsSymbol())
@@ -343,7 +343,8 @@ class IsEqualEvaluateExpr {
                       const Fortran::evaluate::CoarrayRef &y) {
     return isEqual(x.base(), y.base()) &&
            isEqual(x.cosubscript(), y.cosubscript()) &&
-           isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team());
+           isEqual(x.stat(), y.stat()) && isEqual(x.team(), y.team()) &&
+           isEqual(x.notify(), y.notify());
   }
   static bool isEqual(const Fortran::evaluate::NamedEntity &x,
                       const Fortran::evaluate::NamedEntity &y) {
diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp
index 59fe7d813d96a..ea6a1eada2741 100644
--- a/flang/lib/Parser/Fortran-parsers.cpp
+++ b/flang/lib/Parser/Fortran-parsers.cpp
@@ -1212,12 +1212,15 @@ TYPE_CONTEXT_PARSER("image selector"_en_US,
 
 // R926 image-selector-spec ->
 //        STAT = stat-variable | TEAM = team-value |
-//        TEAM_NUMBER = scalar-int-expr
+//        TEAM_NUMBER = scalar-int-expr |
+//        NOTIFY = notify-variable
 TYPE_PARSER(construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Stat>(
                 "STAT =" >> scalar(integer(indirect(variable))))) ||
     construct<ImageSelectorSpec>(construct<TeamValue>("TEAM =" >> teamValue)) ||
     construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Team_Number>(
-        "TEAM_NUMBER =" >> scalarIntExpr)))
+        "TEAM_NUMBER =" >> scalarIntExpr)) ||
+    construct<ImageSelectorSpec>(construct<ImageSelectorSpec::Notify>(
+        "NOTIFY =" >> scalar(indirect(variable)))))
 
 // R927 allocate-stmt ->
 //        ALLOCATE ( [type-spec ::] allocation-list [, alloc-opt-list] )
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 84123030195e9..6bb14a43e7b99 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -819,6 +819,7 @@ class UnparseVisitor {
       Word("TEAM=");
     }
   }
+  void Before(const ImageSelectorSpec::Notify &) { Word("NOTIFY="); }
   void Unparse(const AllocateStmt &x) { // R927
     Word("ALLOCATE(");
     Walk(std::get<std::optional<TypeSpec>>(x.t), "::");
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index de407d3b1e125..9a6b3ff3cdc2c 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -855,6 +855,15 @@ void CheckHelper::CheckObjectEntity(
         messages_.Say(
             "Variable '%s' with EVENT_TYPE or LOCK_TYPE potential component '%s' must be a coarray"_err_en_US,
             symbol.name(), component.BuildResultDesignatorName());
+      } else if (IsNotifyType(derived)) { // C1612
+        messages_.Say(
+            "Variable '%s' with NOTIFY_TYPE must be a coarray"_err_en_US,
+            symbol.name());
+      } else if (auto component{FindNotifyPotentialComponent( // C1611
+                     *derived, /*ignoreCoarrays=*/true)}) {
+        messages_.Say(
+            "Variable '%s' with NOTIFY_TYPE potential component '%s' must be a coarray"_err_en_US,
+            symbol.name(), component.BuildResultDesignatorName());
       }
     }
   }
@@ -873,6 +882,10 @@ void CheckHelper::CheckObjectEntity(
         messages_.Say(
             "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US);
       }
+      if (IsOrContainsNotifyComponent(symbol)) { // C1613
+        messages_.Say(
+            "An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE"_err_en_US);
+      }
       if (IsAssumedSizeArray(symbol)) { // C834
         if (type && type->IsPolymorphic()) {
           messages_.Say(
diff --git a/flang/lib/Semantics/dump-expr.cpp b/flang/lib/Semantics/dump-expr.cpp
index 66cedab94bfb4..8d354cf65b61e 100644
--- a/flang/lib/Semantics/dump-expr.cpp
+++ b/flang/lib/Semantics/dump-expr.cpp
@@ -23,6 +23,7 @@ void DumpEvaluateExpr::Show(const evaluate::CoarrayRef &x) {
   Indent("coarray ref");
   Show(x.base());
   Show(x.cosubscript());
+  Show(x.notify());
   Show(x.stat());
   Show(x.team());
   Outdent();
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index c8167fd34f666..ac58dfc005f17 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1579,6 +1579,19 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::CoindexedNamedObject &x) {
         std::get<std::list<parser::ImageSelectorSpec>>(x.imageSelector.t)) {
       common::visit(
           common::visitors{
+              [&](const parser::ImageSelectorSpec::Notify &x) {
+                Analyze(x.v);
+                if (const auto *expr{GetExpr(context_, x.v)}) {
+                  if (coarrayRef.notify()) {
+                    Say("coindexed reference has multiple NOTIFY= specifiers"_err_en_US);
+                  } else if (auto dyType{expr->GetType()};
+                      dyType && IsNotifyType(GetDerivedTypeSpec(*dyType))) {
+                    coarrayRef.set_notify(Expr<SomeType>{*expr});
+                  } else {
+                    Say("NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV"_err_en_US);
+                  }
+                }
+              },
               [&](const parser::ImageSelectorSpec::Stat &x) {
                 Analyze(x.v);
                 if (const auto *expr{GetExpr(context_, x.v)}) {
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 8eddd03faa962..cf1e5e7d44565 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -582,6 +582,18 @@ bool IsOrContainsEventOrLockComponent(const Symbol &original) {
   return false;
 }
 
+bool IsOrContainsNotifyComponent(const Symbol &original) {
+  const Symbol &symbol{ResolveAssociations(original, /*stopAtTypeGuard=*/true)};
+  if (!evaluate::IsVariable(symbol)) {
+    return false;
+  }
+  const DeclTypeSpec *type{symbol.GetType()};
+  const DerivedTypeSpec *derived{type ? type->AsDerived() : nullptr};
+  // Matches a notify type itself or one found among potential components.
+  return derived &&
+      (IsNotifyType(derived) || FindNotifyPotentialComponent(*derived));
+}
+
 // Check this symbol suitable as a type-bound procedure - C769
 bool CanBeTypeBoundProc(const Symbol &symbol) {
   if (IsDummy(symbol) || IsProcedurePointer(symbol)) {
@@ -1489,6 +1501,32 @@ PotentialComponentIterator::const_iterator FindEventOrLockPotentialComponent(
   return iter;
 }
 
+PotentialComponentIterator::const_iterator FindNotifyPotentialComponent(
+    const DerivedTypeSpec &derived, bool ignoreCoarrays) {
+  PotentialComponentIterator potentials{derived};
+  auto last{potentials.end()};
+  auto it{potentials.begin()};
+  for (; it != last; ++it) {
+    const auto *object{(*it).detailsIf<ObjectEntityDetails>()};
+    const DeclTypeSpec *type{object ? object->type() : nullptr};
+    if (!type || !IsNotifyType(type->AsDerived())) {
+      continue; // not a notify-typed object component
+    }
+    if (!ignoreCoarrays) {
+      break; // any match will do
+    }
+    // Drop the last path entry (presumably the match itself) before checking.
+    auto path{it.GetComponentPath()};
+    path.pop_back();
+    const bool insideCoarray{std::any_of(path.begin(), path.end(),
+        [](const Symbol &sym) { return evaluate::IsCoarray(sym); })};
+    if (!insideCoarray) {
+      break; // match with no coarray earlier on its path
+    }
+  }
+  return it;
+}
+
 UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent(
     const DerivedTypeSpec &derived) {
   UltimateComponentIterator ultimates{derived};
diff --git a/flang/test/Semantics/coarrays02.f90 b/flang/test/Semantics/coarrays02.f90
index b16e0ccb58797..e866dd89c07ab 100644
--- a/flang/test/Semantics/coarrays02.f90
+++ b/flang/test/Semantics/coarrays02.f90
@@ -16,6 +16,8 @@ program main
   type(event_type) event
   !ERROR: Variable 'lock' with EVENT_TYPE or LOCK_TYPE must be a coarray
   type(lock_type) lock
+  !ERROR: Variable 'notify' with NOTIFY_TYPE must be a coarray
+  type(notify_type) notify
   integer :: local[*] ! ok in main
 end
 
@@ -120,3 +122,18 @@ subroutine s4
   !ERROR: Subscripts must appear in a coindexed reference when its base is an array
   print *, ta(1)%a[1]
 end
+
+subroutine s5(a, notify, res) ! exercises NOTIFY_TYPE and NOTIFY= diagnostics
+  use iso_fortran_env
+  type t
+    type(notify_type) :: a
+  end type
+  real, intent(in) :: a[*]
+  type(event_type), intent(in) :: notify[*] ! deliberately not NOTIFY_TYPE
+  !ERROR: An INTENT(OUT) dummy argument may not be, or contain, NOTIFY_TYPE
+  type(notify_type), intent(out) :: res[*]
+  !ERROR: Variable 'bad' with NOTIFY_TYPE potential component '%a' must be a coarray
+  type(t) :: bad ! non-coarray whose component a is NOTIFY_TYPE
+  !ERROR: NOTIFY= specifier must have type NOTIFY_TYPE from ISO_FORTRAN_ENV
+  print *, a[1, NOTIFY=notify]
+end
diff --git a/flang/test/Semantics/notifywait03.f90 b/flang/test/Semantics/notifywait03.f90
index 0fc56f66ad32d..a336a7a67669a 100644
--- a/flang/test/Semantics/notifywait03.f90
+++ b/flang/test/Semantics/notifywait03.f90
@@ -10,6 +10,7 @@ program test_notify_wait
   implicit none
 
   ! notify_type variables must be coarrays
+  !ERROR: Variable 'non_coarray' with NOTIFY_TYPE must be a coarray
   type(notify_type) :: non_coarray
 
   type(notify_type) :: notify_var[*], notify_array(2)[*]
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index 6697ce5b03851..e1b12e3010fe9 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -42,14 +42,37 @@
 #define FP_LLOGBNAN LONG_MAX
 #endif
 
-#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__)
-#define math_errhandling 0
-#elif defined(__NO_MATH_ERRNO__)
-#define math_errhandling (MATH_ERREXCEPT)
+// Math error handling. Target support is assumed to be existent unless
+// explicitly disabled.
+#if defined(__NVPTX__) || defined(__AMDGPU__) || defined(__FAST_MATH__) ||     \
+    defined(__NO_MATH_ERRNO__)
+#define __LIBC_SUPPORTS_MATH_ERRNO 0
+#else
+#define __LIBC_SUPPORTS_MATH_ERRNO 1
+#endif
+
+#if defined(__FAST_MATH__) ||                                                  \
+    ((defined(__arm__) || defined(_M_ARM) || defined(__thumb__) ||             \
+      defined(__aarch64__) || defined(_M_ARM64)) &&                            \
+     !defined(__ARM_FP))
+#define __LIBC_SUPPORTS_MATH_ERREXCEPT 0
 #else
+#define __LIBC_SUPPORTS_MATH_ERREXCEPT 1
+#endif
+
+#if __LIBC_SUPPORTS_MATH_ERRNO && __LIBC_SUPPORTS_MATH_ERREXCEPT
 #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT)
+#elif __LIBC_SUPPORTS_MATH_ERRNO
+#define math_errhandling (MATH_ERRNO)
+#elif __LIBC_SUPPORTS_MATH_ERREXCEPT
+#define math_errhandling (MATH_ERREXCEPT)
+#else
+#define math_errhandling 0
 #endif
 
+#undef __LIBC_SUPPORTS_MATH_ERRNO
+#undef __LIBC_SUPPORTS_MATH_ERREXCEPT
+
 // POSIX math constants
 // https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html
 #define M_E (__extension__ 0x1.5bf0a8b145769p1)
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index b7af751ec3f27..96874702b1fdf 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -161,6 +161,7 @@ add_header_library(
   HDRS
     wctype_utils.h
   DEPENDS
+    libc.hdr.types.wchar_t
     libc.hdr.types.wint_t
 )
 
diff --git a/libc/src/__support/ctype_utils.h b/libc/src/__support/ctype_utils.h
index be0f25330af9e..61b7a0aeb5b67 100644
--- a/libc/src/__support/ctype_utils.h
+++ b/libc/src/__support/ctype_utils.h
@@ -27,7 +27,7 @@ namespace internal {
 // as well as a way to support non-ASCII character encodings.
 
 // Similarly, do not change these functions to use case ranges. e.g.
-//  bool islower(int ch) {
+//  bool islower(char ch) {
 //    switch(ch) {
 //    case 'a'...'z':
 //      return true;
@@ -37,7 +37,7 @@ namespace internal {
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
 
-LIBC_INLINE static constexpr bool islower(int ch) {
+LIBC_INLINE static constexpr bool islower(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -71,7 +71,7 @@ LIBC_INLINE static constexpr bool islower(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isupper(int ch) {
+LIBC_INLINE static constexpr bool isupper(char ch) {
   switch (ch) {
   case 'A':
   case 'B':
@@ -105,7 +105,7 @@ LIBC_INLINE static constexpr bool isupper(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isdigit(int ch) {
+LIBC_INLINE static constexpr bool isdigit(char ch) {
   switch (ch) {
   case '0':
   case '1':
@@ -123,7 +123,7 @@ LIBC_INLINE static constexpr bool isdigit(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int tolower(int ch) {
+LIBC_INLINE static constexpr char tolower(char ch) {
   switch (ch) {
   case 'A':
     return 'a';
@@ -182,7 +182,7 @@ LIBC_INLINE static constexpr int tolower(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int toupper(int ch) {
+LIBC_INLINE static constexpr char toupper(char ch) {
   switch (ch) {
   case 'a':
     return 'A';
@@ -241,7 +241,7 @@ LIBC_INLINE static constexpr int toupper(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isalpha(int ch) {
+LIBC_INLINE static constexpr bool isalpha(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -301,7 +301,7 @@ LIBC_INLINE static constexpr bool isalpha(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr bool isalnum(int ch) {
+LIBC_INLINE static constexpr bool isalnum(char ch) {
   switch (ch) {
   case 'a':
   case 'b':
@@ -371,7 +371,7 @@ LIBC_INLINE static constexpr bool isalnum(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int b36_char_to_int(int ch) {
+LIBC_INLINE static constexpr int b36_char_to_int(char ch) {
   switch (ch) {
   case '0':
     return 0;
@@ -476,7 +476,7 @@ LIBC_INLINE static constexpr int b36_char_to_int(int ch) {
   }
 }
 
-LIBC_INLINE static constexpr int int_to_b36_char(int num) {
+LIBC_INLINE static constexpr char int_to_b36_char(int num) {
   // Can't actually use LIBC_ASSERT here because it depends on integer_to_string
   // which depends on this.
 
@@ -559,7 +559,7 @@ LIBC_INLINE static constexpr int int_to_b36_char(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool isspace(int ch) {
+LIBC_INLINE static constexpr bool isspace(char ch) {
   switch (ch) {
   case ' ':
   case '\t':
@@ -574,7 +574,7 @@ LIBC_INLINE static constexpr bool isspace(int ch) {
 }
 
 // not yet encoding independent.
-LIBC_INLINE static constexpr bool isgraph(int ch) {
+LIBC_INLINE static constexpr bool isgraph(char ch) {
   return 0x20 < ch && ch < 0x7f;
 }
 
diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h
index 29449bd739730..5e7369de00962 100644
--- a/libc/src/__support/integer_to_string.h
+++ b/libc/src/__support/integer_to_string.h
@@ -378,9 +378,8 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
     using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
 
     LIBC_INLINE static char digit_char(uint8_t digit) {
-      const int result = internal::int_to_b36_char(digit);
-      return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result)
-                                                 : result);
+      const char result = internal::int_to_b36_char(digit);
+      return Fmt::IS_UPPERCASE ? internal::toupper(result) : result;
     }
 
     LIBC_INLINE static void
diff --git a/libc/src/__support/wctype_utils.h b/libc/src/__support/wctype_utils.h
index 2ae5ec93b2a63..60b6afb928475 100644
--- a/libc/src/__support/wctype_utils.h
+++ b/libc/src/__support/wctype_utils.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
 #define LLVM_LIBC_SRC___SUPPORT_WCTYPE_UTILS_H
 
+#include "hdr/types/wchar_t.h"
 #include "hdr/types/wint_t.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
@@ -30,7 +31,7 @@ namespace internal {
 
 // Similarly, do not change these fumarks to show your new solution is faster,
 // as well as a way to support non-Anctions to use case ranges. e.g.
-//  bool iswlower(wint_t ch) {
+//  bool iswlower(wchar_t ch) {
 //    switch(ch) {
 //    case L'a'...L'z':
 //      return true;
@@ -40,7 +41,7 @@ namespace internal {
 // EBCDIC. Technically we could use some smaller ranges, but that's even harder
 // to read.
 
-LIBC_INLINE static constexpr bool iswlower(wint_t wch) {
+LIBC_INLINE static constexpr bool iswlower(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -74,7 +75,7 @@ LIBC_INLINE static constexpr bool iswlower(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswupper(wint_t wch) {
+LIBC_INLINE static constexpr bool iswupper(wchar_t wch) {
   switch (wch) {
   case L'A':
   case L'B':
@@ -108,7 +109,7 @@ LIBC_INLINE static constexpr bool iswupper(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswdigit(wint_t wch) {
+LIBC_INLINE static constexpr bool iswdigit(wchar_t wch) {
   switch (wch) {
   case L'0':
   case L'1':
@@ -126,7 +127,7 @@ LIBC_INLINE static constexpr bool iswdigit(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t towlower(wint_t wch) {
+LIBC_INLINE static constexpr wchar_t towlower(wchar_t wch) {
   switch (wch) {
   case L'A':
     return L'a';
@@ -185,7 +186,7 @@ LIBC_INLINE static constexpr wint_t towlower(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t towupper(wint_t wch) {
+LIBC_INLINE static constexpr wchar_t towupper(wchar_t wch) {
   switch (wch) {
   case L'a':
     return L'A';
@@ -244,7 +245,7 @@ LIBC_INLINE static constexpr wint_t towupper(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalpha(wint_t wch) {
+LIBC_INLINE static constexpr bool iswalpha(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -304,7 +305,7 @@ LIBC_INLINE static constexpr bool iswalpha(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswalnum(wint_t wch) {
+LIBC_INLINE static constexpr bool iswalnum(wchar_t wch) {
   switch (wch) {
   case L'a':
   case L'b':
@@ -374,7 +375,7 @@ LIBC_INLINE static constexpr bool iswalnum(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) {
+LIBC_INLINE static constexpr int b36_wchar_to_int(wchar_t wch) {
   switch (wch) {
   case L'0':
     return 0;
@@ -479,7 +480,7 @@ LIBC_INLINE static constexpr int b36_wchar_to_int(wint_t wch) {
   }
 }
 
-LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) {
+LIBC_INLINE static constexpr wchar_t int_to_b36_wchar(int num) {
   // Can't actually use LIBC_ASSERT here because it depends on integer_to_string
   // which depends on this.
 
@@ -562,7 +563,7 @@ LIBC_INLINE static constexpr wint_t int_to_b36_wchar(int num) {
   }
 }
 
-LIBC_INLINE static constexpr bool iswspace(wint_t wch) {
+LIBC_INLINE static constexpr bool iswspace(wchar_t wch) {
   switch (wch) {
   case L' ':
   case L'\t':
diff --git a/libc/src/ctype/CMakeLists.txt b/libc/src/ctype/CMakeLists.txt
index 8830c1bccf9ea..68e982bd4529e 100644
--- a/libc/src/ctype/CMakeLists.txt
+++ b/libc/src/ctype/CMakeLists.txt
@@ -6,6 +6,7 @@ add_entrypoint_object(
     isalnum.h
   DEPENDS
     libc.include.ctype
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -16,6 +17,7 @@ add_entrypoint_object(
   HDRS
     isalpha.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -50,6 +52,7 @@ add_entrypoint_object(
   HDRS
     isdigit.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -60,6 +63,7 @@ add_entrypoint_object(
   HDRS
     isgraph.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -70,6 +74,7 @@ add_entrypoint_object(
   HDRS
     islower.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -88,6 +93,7 @@ add_entrypoint_object(
   HDRS
     ispunct.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -97,6 +103,9 @@ add_entrypoint_object(
     isspace.cpp
   HDRS
     isspace.h
+  DEPENDS
+    libc.src.__support.CPP.limits
+    libc.src.__support.ctype_utils
 )
 
 add_entrypoint_object(
@@ -106,6 +115,7 @@ add_entrypoint_object(
   HDRS
     isupper.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -116,6 +126,7 @@ add_entrypoint_object(
   HDRS
     isxdigit.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -126,6 +137,7 @@ add_entrypoint_object(
   HDRS
     tolower.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -144,6 +156,7 @@ add_entrypoint_object(
   HDRS
     toupper.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
 )
 
@@ -160,6 +173,7 @@ add_entrypoint_object(
     isalnum_l.h
   DEPENDS
     libc.include.ctype
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -171,6 +185,7 @@ add_entrypoint_object(
   HDRS
     isalpha_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -202,6 +217,7 @@ add_entrypoint_object(
   HDRS
     isdigit_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -224,6 +240,7 @@ add_entrypoint_object(
   HDRS
     islower_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -257,6 +274,8 @@ add_entrypoint_object(
     isspace_l.h
   DEPENDS
     libc.hdr.types.locale_t
+    libc.src.__support.CPP.limits
+    libc.src.__support.ctype_utils
 )
 
 add_entrypoint_object(
@@ -266,6 +285,7 @@ add_entrypoint_object(
   HDRS
     isupper_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -277,6 +297,7 @@ add_entrypoint_object(
   HDRS
     isxdigit_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -288,6 +309,7 @@ add_entrypoint_object(
   HDRS
     tolower_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
@@ -299,6 +321,7 @@ add_entrypoint_object(
   HDRS
     toupper_l.h
   DEPENDS
+    libc.src.__support.CPP.limits
     libc.src.__support.ctype_utils
     libc.hdr.types.locale_t
 )
diff --git a/libc/src/ctype/isalnum.cpp b/libc/src/ctype/isalnum.cpp
index 54a3e35748879..102b5e79e4a18 100644
--- a/libc/src/ctype/isalnum.cpp
+++ b/libc/src/ctype/isalnum.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isalnum.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalnum, (int c)) {
-  return static_cast<int>(internal::isalnum(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalnum(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalnum_l.cpp b/libc/src/ctype/isalnum_l.cpp
index 671d9b75c4c33..173e1c174121e 100644
--- a/libc/src/ctype/isalnum_l.cpp
+++ b/libc/src/ctype/isalnum_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isalnum_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalnum_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isalnum(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalnum(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalpha.cpp b/libc/src/ctype/isalpha.cpp
index 78b26f6a486ea..7c874bf373866 100644
--- a/libc/src/ctype/isalpha.cpp
+++ b/libc/src/ctype/isalpha.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isalpha.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalpha, (int c)) {
-  return static_cast<int>(internal::isalpha(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalpha(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isalpha_l.cpp b/libc/src/ctype/isalpha_l.cpp
index 0619d979bedf2..982bcc569faaf 100644
--- a/libc/src/ctype/isalpha_l.cpp
+++ b/libc/src/ctype/isalpha_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isalpha_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isalpha_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isalpha(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isalpha(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isdigit.cpp b/libc/src/ctype/isdigit.cpp
index 1f711943861f8..43553c794a2f3 100644
--- a/libc/src/ctype/isdigit.cpp
+++ b/libc/src/ctype/isdigit.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isdigit.h"
+
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -14,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isdigit, (int c)) {
-  return static_cast<int>(internal::isdigit(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isdigit(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isdigit_l.cpp b/libc/src/ctype/isdigit_l.cpp
index ca981362bfe83..40b5618906dac 100644
--- a/libc/src/ctype/isdigit_l.cpp
+++ b/libc/src/ctype/isdigit_l.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isdigit_l.h"
+
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -14,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isdigit_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isdigit(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isdigit(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isgraph.cpp b/libc/src/ctype/isgraph.cpp
index 74bb2e75d138e..b9308ecb7367c 100644
--- a/libc/src/ctype/isgraph.cpp
+++ b/libc/src/ctype/isgraph.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isgraph.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isgraph, (int c)) {
-  return static_cast<int>(internal::isgraph(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isgraph(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isgraph_l.cpp b/libc/src/ctype/isgraph_l.cpp
index cbef6df148aed..dddcb9be4f80c 100644
--- a/libc/src/ctype/isgraph_l.cpp
+++ b/libc/src/ctype/isgraph_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/isgraph_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isgraph_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isgraph(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isgraph(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/islower.cpp b/libc/src/ctype/islower.cpp
index 831aad32d3a22..920bfc1cc1a59 100644
--- a/libc/src/ctype/islower.cpp
+++ b/libc/src/ctype/islower.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/islower.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, islower, (int c)) {
-  return static_cast<int>(internal::islower(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::islower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/islower_l.cpp b/libc/src/ctype/islower_l.cpp
index b9be6acc81c99..da97026dc59a7 100644
--- a/libc/src/ctype/islower_l.cpp
+++ b/libc/src/ctype/islower_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/islower_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, islower_l, (int c, locale_t)) {
-  return static_cast<int>(internal::islower(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::islower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/ispunct.cpp b/libc/src/ctype/ispunct.cpp
index 0635294220b9c..4950036e9b81f 100644
--- a/libc/src/ctype/ispunct.cpp
+++ b/libc/src/ctype/ispunct.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/ispunct.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, ispunct, (int c)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch));
 }
 
diff --git a/libc/src/ctype/ispunct_l.cpp b/libc/src/ctype/ispunct_l.cpp
index e825fbe2001b0..79cd47b6a214d 100644
--- a/libc/src/ctype/ispunct_l.cpp
+++ b/libc/src/ctype/ispunct_l.cpp
@@ -8,6 +8,7 @@
 
 #include "src/ctype/ispunct_l.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
@@ -15,7 +16,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, ispunct_l, (int c, locale_t)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(!internal::isalnum(ch) && internal::isgraph(ch));
 }
 
diff --git a/libc/src/ctype/isspace.cpp b/libc/src/ctype/isspace.cpp
index 005bf460fc103..998dbf28f51d0 100644
--- a/libc/src/ctype/isspace.cpp
+++ b/libc/src/ctype/isspace.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isspace.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isspace, (int c)) {
-  return static_cast<int>(internal::isspace(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isspace(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isspace_l.cpp b/libc/src/ctype/isspace_l.cpp
index 5c46dd6805126..e40765326b35e 100644
--- a/libc/src/ctype/isspace_l.cpp
+++ b/libc/src/ctype/isspace_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isspace_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isspace_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isspace(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isspace(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isupper.cpp b/libc/src/ctype/isupper.cpp
index 965fa336b28b4..c5c3dbd5d7d4a 100644
--- a/libc/src/ctype/isupper.cpp
+++ b/libc/src/ctype/isupper.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isupper.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isupper, (int c)) {
-  return static_cast<int>(internal::isupper(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isupper_l.cpp b/libc/src/ctype/isupper_l.cpp
index 358990261d603..44ed9dab90a16 100644
--- a/libc/src/ctype/isupper_l.cpp
+++ b/libc/src/ctype/isupper_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isupper_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isupper_l, (int c, locale_t)) {
-  return static_cast<int>(internal::isupper(static_cast<unsigned>(c)));
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  return static_cast<int>(internal::isupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/isxdigit.cpp b/libc/src/ctype/isxdigit.cpp
index 81f645c6f49fc..1b2e71769b3f8 100644
--- a/libc/src/ctype/isxdigit.cpp
+++ b/libc/src/ctype/isxdigit.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isxdigit.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(internal::isalnum(ch) &&
                           internal::b36_char_to_int(ch) < 16);
 }
diff --git a/libc/src/ctype/isxdigit_l.cpp b/libc/src/ctype/isxdigit_l.cpp
index eddfd20a2da3b..e6150473b0043 100644
--- a/libc/src/ctype/isxdigit_l.cpp
+++ b/libc/src/ctype/isxdigit_l.cpp
@@ -7,15 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/isxdigit_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) {
-  const unsigned ch = static_cast<unsigned>(c);
+  if (c < 0 || c > cpp::numeric_limits<unsigned char>::max())
+    return 0;
+  const char ch = static_cast<char>(c);
   return static_cast<int>(internal::isalnum(ch) &&
                           internal::b36_char_to_int(ch) < 16);
 }
diff --git a/libc/src/ctype/tolower.cpp b/libc/src/ctype/tolower.cpp
index 3ecad7bc5d5d5..b45c5f2688a61 100644
--- a/libc/src/ctype/tolower.cpp
+++ b/libc/src/ctype/tolower.cpp
@@ -7,13 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/tolower.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, tolower, (int c)) { return internal::tolower(c); }
+LLVM_LIBC_FUNCTION(int, tolower, (int c)) {
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::tolower(static_cast<char>(c)));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/tolower_l.cpp b/libc/src/ctype/tolower_l.cpp
index 7ccf31617e592..049e46aea13c0 100644
--- a/libc/src/ctype/tolower_l.cpp
+++ b/libc/src/ctype/tolower_l.cpp
@@ -7,15 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/tolower_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, tolower_l, (int c, locale_t)) {
-  return internal::tolower(c);
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::tolower(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/toupper.cpp b/libc/src/ctype/toupper.cpp
index 1e1e8fc400711..0e387238ce3b6 100644
--- a/libc/src/ctype/toupper.cpp
+++ b/libc/src/ctype/toupper.cpp
@@ -7,13 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/toupper.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); }
+LLVM_LIBC_FUNCTION(int, toupper, (int c)) {
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::toupper(static_cast<char>(c)));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/ctype/toupper_l.cpp b/libc/src/ctype/toupper_l.cpp
index a435ca1ab5d41..d1dff262c9377 100644
--- a/libc/src/ctype/toupper_l.cpp
+++ b/libc/src/ctype/toupper_l.cpp
@@ -7,15 +7,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/ctype/toupper_l.h"
-#include "src/__support/ctype_utils.h"
 
+#include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/ctype_utils.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) {
-  return internal::toupper(c);
+  if (c < cpp::numeric_limits<char>::min() ||
+      c > cpp::numeric_limits<char>::max()) {
+    return c;
+  }
+  return static_cast<int>(internal::toupper(static_cast<char>(c)));
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/printf_core/float_dec_converter_limited.h b/libc/src/stdio/printf_core/float_dec_converter_limited.h
index 9cdc13573d320..0f85d0a8d26b4 100644
--- a/libc/src/stdio/printf_core/float_dec_converter_limited.h
+++ b/libc/src/stdio/printf_core/float_dec_converter_limited.h
@@ -363,8 +363,8 @@ DigitsOutput decimal_digits(DigitsInput input, int precision, bool e_mode) {
       // we made it from and doing the decimal conversion all over again.)
       for (size_t i = output.ndigits; i-- > 0;) {
         if (output.digits[i] != '9') {
-          output.digits[i] = static_cast<char>(internal::int_to_b36_char(
-              internal::b36_char_to_int(output.digits[i]) + 1));
+          output.digits[i] = internal::int_to_b36_char(
+              internal::b36_char_to_int(output.digits[i]) + 1);
           break;
         } else {
           output.digits[i] = '0';
diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h
index 16592e7bac932..9b57f1d803e74 100644
--- a/libc/src/stdio/printf_core/float_hex_converter.h
+++ b/libc/src/stdio/printf_core/float_hex_converter.h
@@ -137,9 +137,9 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer,
   size_t first_non_zero = 1;
   for (; mant_cur > 0; --mant_cur, mantissa >>= 4) {
     char mant_mod_16 = static_cast<char>(mantissa % 16);
-    char new_digit = static_cast<char>(internal::int_to_b36_char(mant_mod_16));
+    char new_digit = internal::int_to_b36_char(mant_mod_16);
     if (internal::isupper(to_conv.conv_name))
-      new_digit = static_cast<char>(internal::toupper(new_digit));
+      new_digit = internal::toupper(new_digit);
     mant_buffer[mant_cur - 1] = new_digit;
     if (new_digit != '0' && first_non_zero < mant_cur)
       first_non_zero = mant_cur;
@@ -167,8 +167,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer<write_mode> *writer,
 
   size_t exp_cur = EXP_LEN;
   for (; exponent > 0; --exp_cur, exponent /= 10) {
-    exp_buffer[exp_cur - 1] =
-        static_cast<char>(internal::int_to_b36_char(exponent % 10));
+    exp_buffer[exp_cur - 1] = internal::int_to_b36_char(exponent % 10);
   }
   if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0.
     exp_buffer[EXP_LEN - 1] = '0';
diff --git a/libc/src/stdlib/l64a.cpp b/libc/src/stdlib/l64a.cpp
index d59e65e7dc4c2..d8fe8ef86bf7d 100644
--- a/libc/src/stdlib/l64a.cpp
+++ b/libc/src/stdlib/l64a.cpp
@@ -32,15 +32,13 @@ constexpr static char b64_int_to_char(uint32_t num) {
   if (num == 1)
     return '/';
   if (num < 38)
-    return static_cast<char>(
-        internal::toupper(internal::int_to_b36_char(num - 2)));
+    return internal::toupper(internal::int_to_b36_char(num - 2));
 
   // this tolower is technically unnecessary, but it provides safety if we
   // change the default behavior of int_to_b36_char. Also the compiler
   // completely elides it so there's no performance penalty, see:
   // https://godbolt.org/z/o5ennv7fc
-  return static_cast<char>(
-      internal::tolower(internal::int_to_b36_char(num - 2 - 26)));
+  return internal::tolower(internal::int_to_b36_char(num - 2 - 26));
 }
 
 // This function takes a long and converts the low 32 bits of it into at most 6
diff --git a/libc/src/string/strcasestr.cpp b/libc/src/string/strcasestr.cpp
index de8e4bec7fe0b..575d6bed16d11 100644
--- a/libc/src/string/strcasestr.cpp
+++ b/libc/src/string/strcasestr.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(char *, strcasestr,
                    (const char *haystack, const char *needle)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
 
   LIBC_CRASH_ON_NULLPTR(haystack);
diff --git a/libc/src/strings/strcasecmp.cpp b/libc/src/strings/strcasecmp.cpp
index 4bbe2909df1e2..4518647deabe4 100644
--- a/libc/src/strings/strcasecmp.cpp
+++ b/libc/src/strings/strcasecmp.cpp
@@ -17,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, strcasecmp, (const char *left, const char *right)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strcmp(left, right, case_cmp);
 }
diff --git a/libc/src/strings/strcasecmp_l.cpp b/libc/src/strings/strcasecmp_l.cpp
index 95117cb27a564..d77f95637a396 100644
--- a/libc/src/strings/strcasecmp_l.cpp
+++ b/libc/src/strings/strcasecmp_l.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strcasecmp_l,
                    (const char *left, const char *right, locale_t)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strcmp(left, right, case_cmp);
 }
diff --git a/libc/src/strings/strncasecmp.cpp b/libc/src/strings/strncasecmp.cpp
index 9c2f0ab131269..a5926495a3e22 100644
--- a/libc/src/strings/strncasecmp.cpp
+++ b/libc/src/strings/strncasecmp.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strncasecmp,
                    (const char *left, const char *right, size_t n)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strncmp(left, right, n, case_cmp);
 }
diff --git a/libc/src/strings/strncasecmp_l.cpp b/libc/src/strings/strncasecmp_l.cpp
index 91ac7e5e89107..a828f609fd9e8 100644
--- a/libc/src/strings/strncasecmp_l.cpp
+++ b/libc/src/strings/strncasecmp_l.cpp
@@ -18,8 +18,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, strncasecmp_l,
                    (const char *left, const char *right, size_t n, locale_t)) {
   auto case_cmp = [](char a, char b) {
-    return LIBC_NAMESPACE::internal::tolower(a) -
-           LIBC_NAMESPACE::internal::tolower(b);
+    return static_cast<int>(LIBC_NAMESPACE::internal::tolower(a)) -
+           static_cast<int>(LIBC_NAMESPACE::internal::tolower(b));
   };
   return inline_strncmp(left, right, n, case_cmp);
 }
diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp
index 09f55d391dbff..e151363b88d0b 100644
--- a/libc/src/wctype/iswalpha.cpp
+++ b/libc/src/wctype/iswalpha.cpp
@@ -14,6 +14,8 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) { return internal::iswalpha(c); }
+LLVM_LIBC_FUNCTION(int, iswalpha, (wint_t c)) {
+  return internal::iswalpha(static_cast<wchar_t>(c));
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt
index 235e9fe2f55ee..d0752ea178429 100644
--- a/libc/test/IntegrationTest/CMakeLists.txt
+++ b/libc/test/IntegrationTest/CMakeLists.txt
@@ -14,5 +14,6 @@ add_object_library(
     libc.hdr.stdint_proxy
     libc.src.__support.OSUtil.osutil
     libc.src.__support.CPP.atomic
+    libc.src.__support.macros.properties.architectures
     ${arch_specific_deps}
 )
diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h
index 4a03f7aa6318b..9f5a3dfb3583c 100644
--- a/libc/test/IntegrationTest/test.h
+++ b/libc/test/IntegrationTest/test.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/OSUtil/exit.h"
 #include "src/__support/OSUtil/io.h"
+#include "src/__support/macros/properties/architectures.h"
 
 #define __AS_STRING(val) #val
 #define __CHECK_TRUE(file, line, val, should_exit)                             \
@@ -68,9 +69,15 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Errno checks.
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+#define ASSERT_ERRNO_EQ(VAL)
+#define ASSERT_ERRNO_SUCCESS()
+#define ASSERT_ERRNO_FAILURE()
+#else
 #define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(errno))
 #define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(errno))
 #define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(errno))
+#endif
 
 // Integration tests are compiled with -ffreestanding which stops treating
 // the main function as a non-overloadable special function. Hence, we use a
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 31d1e9dce8204..3197b3d7fd01b 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -204,5 +204,6 @@ add_header_library(
     ErrnoCheckingTest.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.macros.properties.architectures
     libc.src.errno.errno
 )
diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h
index 5b1bc9441d830..111d812c58612 100644
--- a/libc/test/UnitTest/ErrnoCheckingTest.h
+++ b/libc/test/UnitTest/ErrnoCheckingTest.h
@@ -11,11 +11,17 @@
 
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/Test.h"
 
 // Define macro to validate the value stored in the errno and restore it
 // to zero.
 
+#ifdef LIBC_TARGET_ARCH_IS_GPU
+#define ASSERT_ERRNO_EQ(VAL)
+#define ASSERT_ERRNO_SUCCESS()
+#define ASSERT_ERRNO_FAILURE()
+#else
 #define ASSERT_ERRNO_EQ(VAL)                                                   \
   do {                                                                         \
     ASSERT_EQ(VAL, static_cast<int>(libc_errno));                              \
@@ -27,6 +33,7 @@
     ASSERT_NE(0, static_cast<int>(libc_errno));                                \
     libc_errno = 0;                                                            \
   } while (0)
+#endif
 
 namespace LIBC_NAMESPACE_DECL {
 namespace testing {
diff --git a/libc/test/UnitTest/MemoryMatcher.cpp b/libc/test/UnitTest/MemoryMatcher.cpp
index 6e375768e9333..405f226798f7a 100644
--- a/libc/test/UnitTest/MemoryMatcher.cpp
+++ b/libc/test/UnitTest/MemoryMatcher.cpp
@@ -41,8 +41,8 @@ bool MemoryMatcher::match(MemoryView actualValue) {
 
 static void display(char C) {
   const auto print = [](unsigned char i) {
-    tlog << static_cast<char>(LIBC_NAMESPACE::internal::toupper(
-        LIBC_NAMESPACE::internal::int_to_b36_char(i)));
+    tlog << LIBC_NAMESPACE::internal::toupper(
+        LIBC_NAMESPACE::internal::int_to_b36_char(i));
   };
   print(static_cast<unsigned char>(C) / 16);
   print(static_cast<unsigned char>(C) & 15);
diff --git a/libc/test/src/ctype/islower_test.cpp b/libc/test/src/ctype/islower_test.cpp
index f877171abb9a3..e4e5f5cefd954 100644
--- a/libc/test/src/ctype/islower_test.cpp
+++ b/libc/test/src/ctype/islower_test.cpp
@@ -40,7 +40,7 @@ TEST(LlvmLibcIsLower, SimpleTest) {
 }
 
 TEST(LlvmLibcIsLower, DefaultLocale) {
-  // Loops through all characters, verifying that numbers and letters
+  // Loops through all characters, verifying that only lowercase letters
   // return non-zero integer and everything else returns a zero.
   for (int ch = -255; ch < 255; ++ch) {
     if (in_span(ch, LOWER_ARRAY))
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c785..3a7da1fa85ac7 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -177,8 +177,8 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     char small_string[4] = {'\0', '\0', '\0', '\0'};
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         if (first_digit < base) {
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
@@ -192,11 +192,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<char>(
-              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit);
           if (first_digit < base && second_digit < base) {
             ASSERT_EQ(
                 func(small_string, nullptr, base),
@@ -216,14 +216,14 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<char>(
-            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_char(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<char>(
-              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_char(second_digit);
           for (int third_digit = 0; third_digit <= limit; ++third_digit) {
-            small_string[2] = static_cast<char>(
-                LIBC_NAMESPACE::internal::int_to_b36_char(third_digit));
+            small_string[2] =
+                LIBC_NAMESPACE::internal::int_to_b36_char(third_digit);
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
diff --git a/libc/test/src/wchar/WcstolTest.h b/libc/test/src/wchar/WcstolTest.h
index 4d5b752e62238..cadf9e0c42b90 100644
--- a/libc/test/src/wchar/WcstolTest.h
+++ b/libc/test/src/wchar/WcstolTest.h
@@ -178,8 +178,8 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     wchar_t small_string[4] = {L'\0', L'\0', L'\0', L'\0'};
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         if (first_digit < base) {
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
@@ -193,11 +193,11 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<wchar_t>(
-              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit);
           if (first_digit < base && second_digit < base) {
             ASSERT_EQ(
                 func(small_string, nullptr, base),
@@ -217,14 +217,14 @@ struct WcstoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 
     for (int base = 2; base <= 36; ++base) {
       for (int first_digit = 0; first_digit <= 36; ++first_digit) {
-        small_string[0] = static_cast<wchar_t>(
-            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit));
+        small_string[0] =
+            LIBC_NAMESPACE::internal::int_to_b36_wchar(first_digit);
         for (int second_digit = 0; second_digit <= 36; ++second_digit) {
-          small_string[1] = static_cast<wchar_t>(
-              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit));
+          small_string[1] =
+              LIBC_NAMESPACE::internal::int_to_b36_wchar(second_digit);
           for (int third_digit = 0; third_digit <= limit; ++third_digit) {
-            small_string[2] = static_cast<wchar_t>(
-                LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit));
+            small_string[2] =
+                LIBC_NAMESPACE::internal::int_to_b36_wchar(third_digit);
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
diff --git a/libcxxabi/src/demangle/cp-to-llvm.sh b/libcxxabi/src/demangle/cp-to-llvm.sh
index f773dff9f0a8b..9c1db6fec29a6 100755
--- a/libcxxabi/src/demangle/cp-to-llvm.sh
+++ b/libcxxabi/src/demangle/cp-to-llvm.sh
@@ -42,6 +42,7 @@ copy_files() {
     chmod -w $dst/README.txt
 
     for I in $hdrs ; do
+	    echo "Copying ${src}/$I to ${dst}/$I"
 	    rm -f $dst/$I
 	    dash=$(echo "$I---------------------------" | cut -c -27 |\
 		       sed 's|[^-]*||')
@@ -53,6 +54,6 @@ copy_files() {
 }
 
 if [[ $ANSWER =~ ^[Yy]$ ]]; then
-    copy_files . $LLVM_DEMANGLE_DIR $HDRS
-    copy_files ../../test $LLVM_TESTING_DIR $TEST_HDRS
+    copy_files . $LLVM_DEMANGLE_DIR "${HDRS}"
+    copy_files ../../test $LLVM_TESTING_DIR "${TEST_HDRS}"
 fi
diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt
index 01b5546fee00d..0736e6ba132c8 100644
--- a/lldb/CMakeLists.txt
+++ b/lldb/CMakeLists.txt
@@ -62,11 +62,16 @@ if (LLDB_ENABLE_PYTHON)
   set(cachestring_LLDB_PYTHON_EXT_SUFFIX
     "Filename extension for native code python modules")
 
+  if (LLDB_ENABLE_PYTHON_LIMITED_API)
+    set(stable_abi "--stable-abi")
+  endif()
+
   foreach(var LLDB_PYTHON_RELATIVE_PATH LLDB_PYTHON_EXE_RELATIVE_PATH LLDB_PYTHON_EXT_SUFFIX)
     if(NOT DEFINED ${var} AND NOT CMAKE_CROSSCOMPILING)
       execute_process(
         COMMAND ${Python3_EXECUTABLE}
           ${CMAKE_CURRENT_SOURCE_DIR}/bindings/python/get-python-config.py
+          ${stable_abi}
           ${var}
         OUTPUT_VARIABLE value
         OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/lldb/bindings/python/get-python-config.py b/lldb/bindings/python/get-python-config.py
index ae84cbb1215a9..bf8cc48b013e1 100755
--- a/lldb/bindings/python/get-python-config.py
+++ b/lldb/bindings/python/get-python-config.py
@@ -18,6 +18,9 @@ def relpath_nodots(path, base):
 def main():
     parser = argparse.ArgumentParser(description="extract cmake variables from python")
     parser.add_argument("variable_name")
+    parser.add_argument(
+        "--stable-abi", action="store_true", help="Target the Stable C ABI"
+    )
     args = parser.parse_args()
     if args.variable_name == "LLDB_PYTHON_RELATIVE_PATH":
         # LLDB_PYTHON_RELATIVE_PATH is the relative path from lldb's prefix
@@ -68,7 +71,10 @@ def main():
                     print("sys.prefix:", sys.prefix, file=sys.stderr)
                     sys.exit(1)
     elif args.variable_name == "LLDB_PYTHON_EXT_SUFFIX":
-        print(sysconfig.get_config_var("EXT_SUFFIX"))
+        if args.stable_abi:
+            print(".abi3%s" % sysconfig.get_config_var("SHLIB_SUFFIX"))
+        else:
+            print(sysconfig.get_config_var("EXT_SUFFIX"))
     else:
         parser.error(f"unknown variable {args.variable_name}")
 
diff --git a/lldb/include/lldb/lldb-private-types.h b/lldb/include/lldb/lldb-private-types.h
index b82a2b8aa0574..185467e91bf62 100644
--- a/lldb/include/lldb/lldb-private-types.h
+++ b/lldb/include/lldb/lldb-private-types.h
@@ -102,13 +102,18 @@ struct RegisterSet {
 /// A type-erased pair of llvm::dwarf::SourceLanguageName and version.
 struct SourceLanguage {
   SourceLanguage() = default;
-  SourceLanguage(lldb::LanguageType language_type);
+  explicit SourceLanguage(lldb::LanguageType language_type);
+
   SourceLanguage(uint16_t name, uint32_t version)
       : name(name), version(version) {}
-  SourceLanguage(std::optional<std::pair<uint16_t, uint32_t>> name_vers)
+
+  explicit SourceLanguage(
+      std::optional<std::pair<uint16_t, uint32_t>> name_vers)
       : name(name_vers ? name_vers->first : 0),
         version(name_vers ? name_vers->second : 0) {}
-  operator bool() const { return name > 0; }
+
+  explicit operator bool() const { return name > 0; }
+
   lldb::LanguageType AsLanguageType() const;
   llvm::StringRef GetDescription() const;
   bool IsC() const;
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index f25209c15e007..25285beb7ffd5 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -251,7 +251,7 @@ bool BreakpointLocation::ConditionSaysStop(ExecutionContext &exe_ctx,
     }
 
     m_user_expression_sp.reset(GetTarget().GetUserExpressionForLanguage(
-        condition.GetText(), llvm::StringRef(), language,
+        condition.GetText(), llvm::StringRef(), SourceLanguage{language},
         Expression::eResultTypeAny, EvaluateExpressionOptions(), nullptr,
         error));
     if (error.Fail()) {
diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
index 0d9eb45732161..40f00c90bbbfb 100644
--- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp
+++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp
@@ -95,9 +95,9 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
   StackFrame *frame = m_exe_ctx.GetFramePtr();
 
   // Either the language was explicitly specified, or we check the frame.
-  lldb::LanguageType language = m_expr_options.language;
-  if (language == lldb::eLanguageTypeUnknown && frame)
-    language = frame->GuessLanguage().AsLanguageType();
+  SourceLanguage language{m_expr_options.language};
+  if (!language && frame)
+    language = frame->GuessLanguage();
 
   // Add a hint if object description was requested, but no description
   // function was implemented.
@@ -119,8 +119,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
         "^<\\S+: 0x[[:xdigit:]]{5,}>\\s*$");
 
     if (GetDebugger().GetShowDontUsePoHint() && target_ptr &&
-        (language == lldb::eLanguageTypeSwift ||
-         language == lldb::eLanguageTypeObjC) &&
+        (language.AsLanguageType() == lldb::eLanguageTypeSwift ||
+         language.IsObjC()) &&
         std::regex_match(output.data(), swift_class_regex)) {
 
       result.AppendNote(
@@ -193,7 +193,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command,
 
   // Second, try `expr` as a persistent variable.
   if (expr.starts_with("$"))
-    if (auto *state = target.GetPersistentExpressionStateForLanguage(language))
+    if (auto *state = target.GetPersistentExpressionStateForLanguage(
+            language.AsLanguageType()))
       if (auto var_sp = state->GetVariable(expr))
         if (auto valobj_sp = var_sp->GetValueObject()) {
           dump_val_object(*valobj_sp);
diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp
index af4b477660eeb..5563eba21777e 100644
--- a/lldb/source/Expression/UserExpression.cpp
+++ b/lldb/source/Expression/UserExpression.cpp
@@ -246,7 +246,7 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx,
   // language in the target's properties if specified, else default to the
   // langage for the frame.
   if (!language) {
-    if (target->GetLanguage() != lldb::eLanguageTypeUnknown)
+    if (target->GetLanguage())
       language = target->GetLanguage();
     else if (StackFrame *frame = exe_ctx.GetFramePtr())
       language = frame->GetLanguage();
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
index 990074566be7e..6bab880b4d521 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp
@@ -1502,7 +1502,7 @@ lldb_private::Status ClangExpressionParser::DoPrepareForExecution(
     LLDB_LOGF(log, "%s - Current expression language is %s\n", __FUNCTION__,
               lang.GetDescription().data());
     lldb::ProcessSP process_sp = exe_ctx.GetProcessSP();
-    if (process_sp && lang != lldb::eLanguageTypeUnknown) {
+    if (process_sp && lang) {
       auto runtime = process_sp->GetLanguageRuntime(lang.AsLanguageType());
       if (runtime)
         runtime->GetIRPasses(custom_passes);
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
index 1c575e90bd72c..46cf9b8524ede 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/UdtRecordCompleter.cpp
@@ -442,6 +442,10 @@ void UdtRecordCompleter::Record::ConstructRecord() {
 
   // The end offset to a vector of field/struct that ends at the offset.
   std::map<uint64_t, std::vector<Member *>> end_offset_map;
+  auto is_last_end_offset = [&](auto it) {
+    return it != end_offset_map.end() && ++it == end_offset_map.end();
+  };
+
   for (auto &pair : fields_map) {
     uint64_t offset = pair.first;
     auto &fields = pair.second;
@@ -462,8 +466,23 @@ void UdtRecordCompleter::Record::ConstructRecord() {
       }
       if (iter->second.empty())
         continue;
-      parent = iter->second.back();
-      iter->second.pop_back();
+
+      // If the new fields come after the already added ones
+      // without overlap, go back to the root.
+      if (iter->first <= offset && is_last_end_offset(iter)) {
+        if (record.kind == Member::Struct) {
+          parent = &record;
+        } else {
+          assert(record.kind == Member::Union &&
+                 "Current record must be a union");
+          assert(!record.fields.empty());
+          // For unions, append the field to the last struct
+          parent = record.fields.back().get();
+        }
+      } else {
+        parent = iter->second.back();
+        iter->second.pop_back();
+      }
     }
     // If it's a field, then the field is inside a union, so we can safely
     // increase its size by converting it to a struct to hold multiple fields.
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index 2ed58c5331df4..95b515412d693 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1344,18 +1344,18 @@ const char *StackFrame::GetDisplayFunctionName() {
 SourceLanguage StackFrame::GetLanguage() {
   CompileUnit *cu = GetSymbolContext(eSymbolContextCompUnit).comp_unit;
   if (cu)
-    return cu->GetLanguage();
+    return SourceLanguage{cu->GetLanguage()};
   return {};
 }
 
 SourceLanguage StackFrame::GuessLanguage() {
   SourceLanguage lang_type = GetLanguage();
 
-  if (lang_type == eLanguageTypeUnknown) {
+  if (!lang_type) {
     SymbolContext sc =
         GetSymbolContext(eSymbolContextFunction | eSymbolContextSymbol);
     if (sc.function)
-      lang_type = LanguageType(sc.function->GetMangled().GuessLanguage());
+      lang_type = SourceLanguage(sc.function->GetMangled().GuessLanguage());
     else if (sc.symbol)
       lang_type = SourceLanguage(sc.symbol->GetMangled().GuessLanguage());
   }
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index a23091ad09c6d..e53fc7a1e1bda 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -4945,7 +4945,7 @@ void TargetProperties::SetStandardErrorPath(llvm::StringRef path) {
 
 SourceLanguage TargetProperties::GetLanguage() const {
   const uint32_t idx = ePropertyLanguage;
-  return {GetPropertyAtIndexAs<LanguageType>(idx, {})};
+  return SourceLanguage{GetPropertyAtIndexAs<LanguageType>(idx, {})};
 }
 
 llvm::StringRef TargetProperties::GetExpressionPrefixContents() {
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
index 36bfdb9a8e565..83ed533eb13e3 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
+++ b/lldb/test/Shell/SymbolFile/NativePDB/class_layout.cpp
@@ -34,9 +34,6 @@
 // CHECK-NEXT:           s4 = {
 // CHECK-NEXT:             x = ([0] = 67, [1] = 68, [2] = 99)
 // CHECK-NEXT:           }
-// CHECK-NEXT:           s1 = {
-// CHECK-NEXT:             x = ([0] = 69, [1] = 70, [2] = 71)
-// CHECK-NEXT:           }
 // CHECK-NEXT:         }
 // CHECK-NEXT:       }
 // CHECK-NEXT:     }
@@ -47,6 +44,9 @@
 // CHECK-NEXT:       c2 = 'D'
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
+// CHECK-NEXT:   s1 = {
+// CHECK-NEXT:     x = ([0] = 69, [1] = 70, [2] = 71)
+// CHECK-NEXT:   }
 // CHECK-NEXT: }
 // CHECK-NEXT: (lldb) type lookup C
 // CHECK-NEXT: struct C {
@@ -63,7 +63,6 @@
 // CHECK-NEXT:                 struct {
 // CHECK-NEXT:                     char c4;
 // CHECK-NEXT:                     S3 s4;
-// CHECK-NEXT:                     S3 s1;
 // CHECK-NEXT:                 };
 // CHECK-NEXT:             };
 // CHECK-NEXT:         };
@@ -72,6 +71,7 @@
 // CHECK-NEXT:             char c2;
 // CHECK-NEXT:         };
 // CHECK-NEXT:     };
+// CHECK-NEXT:     S3 s1;
 // CHECK-NEXT: }
 
 
diff --git a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
index 9d0d60fdaaed9..c8dce75af05eb 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/MachVMRegion.cpp
@@ -14,6 +14,12 @@
 #include "DNBLog.h"
 #include <cassert>
 #include <mach/mach_vm.h>
+#include <mach/vm_statistics.h>
+
+// From <mach/vm_statistics.h>, but not on older OSs.
+#ifndef VM_MEMORY_SANITIZER
+#define VM_MEMORY_SANITIZER 99
+#endif
 
 MachVMRegion::MachVMRegion(task_t task)
     : m_task(task), m_addr(INVALID_NUB_ADDRESS), m_err(),
diff --git a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
index 7060638a94864..433d48fab9d85 100644
--- a/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
+++ b/lldb/tools/lldb-dap/src-ts/debug-adapter-factory.ts
@@ -6,6 +6,7 @@ import * as fs from "node:fs/promises";
 import { ConfigureButton, OpenSettingsButton } from "./ui/show-error-message";
 import { ErrorWithNotification } from "./ui/error-with-notification";
 import { LogFilePathProvider, LogType } from "./logging";
+import { expandUser } from "./utils";
 
 const exec = util.promisify(child_process.execFile);
 
@@ -116,8 +117,9 @@ async function getDAPExecutable(
   configuration: vscode.DebugConfiguration,
 ): Promise<string> {
   // Check if the executable was provided in the launch configuration.
-  const launchConfigPath = configuration["debugAdapterExecutable"];
+  let launchConfigPath = configuration["debugAdapterExecutable"];
   if (typeof launchConfigPath === "string" && launchConfigPath.length !== 0) {
+    launchConfigPath = expandUser(launchConfigPath);
     if (!(await isExecutable(launchConfigPath))) {
       throw new ErrorWithNotification(
         `Debug adapter path "${launchConfigPath}" is not a valid file. The path comes from your launch configuration.`,
@@ -129,7 +131,7 @@ async function getDAPExecutable(
 
   // Check if the executable was provided in the extension's configuration.
   const config = vscode.workspace.getConfiguration("lldb-dap", workspaceFolder);
-  const configPath = config.get<string>("executable-path");
+  const configPath = expandUser(config.get<string>("executable-path") ?? "");
   if (configPath && configPath.length !== 0) {
     if (!(await isExecutable(configPath))) {
       throw new ErrorWithNotification(
diff --git a/lldb/tools/lldb-dap/src-ts/utils.ts b/lldb/tools/lldb-dap/src-ts/utils.ts
new file mode 100644
index 0000000000000..efebe0b0f42ba
--- /dev/null
+++ b/lldb/tools/lldb-dap/src-ts/utils.ts
@@ -0,0 +1,48 @@
+import * as os from "os";
+import * as path from "path";
+
+/**
+ * Expands a leading `~` or `~<current user>` in `file_path` to the current
+ * user's home directory.
+ *
+ * The path is returned unchanged on Windows, when it does not start with `~`,
+ * or when the name after `~` is not the current user's. An empty path yields
+ * an empty string.
+ */
+export function expandUser(file_path: string): string {
+  if (os.platform() === "win32") {
+    return file_path;
+  }
+
+  if (!file_path) {
+    return "";
+  }
+
+  if (!file_path.startsWith("~")) {
+    return file_path;
+  }
+
+  // A bare `~` is the home directory itself.
+  if (file_path.length === 1) {
+    return os.homedir();
+  }
+
+  // `~/rest`: replace the tilde with the home directory.
+  if (file_path.charAt(1) === path.sep) {
+    return path.join(os.homedir(), file_path.substring(1));
+  }
+
+  // `~name` or `~name/rest`: expand only when `name` is the current user.
+  const sep_index = file_path.indexOf(path.sep);
+  const user_name_end = sep_index === -1 ? file_path.length : sep_index;
+  const user_name = file_path.substring(1, user_name_end);
+  try {
+    if (user_name === os.userInfo().username) {
+      return path.join(os.homedir(), file_path.substring(user_name_end));
+    }
+  } catch (err) {
+    // Looking up the current user failed; leave the path untouched.
+  }
+
+  return file_path;
+}
diff --git a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
index 17284b61b9a6e..cd6db5fcb1f4c 100644
--- a/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
+++ b/lldb/unittests/SymbolFile/NativePDB/UdtRecordCompleterTests.cpp
@@ -99,7 +99,7 @@ Member *AddField(Member *member, StringRef name, uint64_t byte_offset,
       std::make_unique<Member>(name, byte_offset * 8, byte_size * 8,
                                clang::QualType(), lldb::eAccessPublic, 0);
   field->kind = kind;
-  field->base_offset = base_offset;
+  field->base_offset = base_offset * 8;
   member->fields.push_back(std::move(field));
   return member->fields.back().get();
 }
@@ -111,6 +111,9 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   CollectMember("m2", 0, 4);
   CollectMember("m3", 0, 1);
   CollectMember("m4", 0, 8);
+  CollectMember("m5", 8, 8);
+  CollectMember("m6", 16, 4);
+  CollectMember("m7", 16, 8);
   ConstructRecord();
 
   // struct {
@@ -120,6 +123,11 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   //       m3;
   //       m4;
   //   };
+  //   m5;
+  //   union {
+  //       m6;
+  //       m7;
+  //   };
   // };
   Record record;
   record.start_offset = 0;
@@ -128,6 +136,10 @@ TEST_F(UdtRecordCompleterRecordTests, TestAnonymousUnionInStruct) {
   AddField(u, "m2", 0, 4, Member::Field);
   AddField(u, "m3", 0, 1, Member::Field);
   AddField(u, "m4", 0, 8, Member::Field);
+  AddField(&record.record, "m5", 8, 8, Member::Field);
+  Member *u2 = AddField(&record.record, "", 16, 0, Member::Union);
+  AddField(u2, "m6", 16, 4, Member::Field);
+  AddField(u2, "m7", 16, 8, Member::Field);
   EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
 }
 
@@ -243,3 +255,41 @@ TEST_F(UdtRecordCompleterRecordTests, TestNestedUnionStructInUnion) {
   AddField(s2, "m4", 2, 4, Member::Field);
   EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
 }
+
+TEST_F(UdtRecordCompleterRecordTests, TestNestedStructInUnionInStructInUnion) {
+  SetKind(Member::Kind::Union);
+  CollectMember("m1", 0, 4);
+  CollectMember("m2", 0, 2);
+  CollectMember("m3", 0, 2);
+  CollectMember("m4", 2, 4);
+  CollectMember("m5", 6, 2);
+  CollectMember("m6", 6, 2);
+  CollectMember("m7", 8, 2);
+  ConstructRecord();
+
+  // union {
+  //   m1;
+  //   m2;
+  //   struct {
+  //       m3;
+  //       m4;
+  //       union {
+  //           m5;
+  //           m6;
+  //       };
+  //       m7;
+  //   };
+  // };
+  Record record;
+  record.start_offset = 0;
+  AddField(&record.record, "m1", 0, 4, Member::Field);
+  AddField(&record.record, "m2", 0, 2, Member::Field);
+  Member *s = AddField(&record.record, "", 0, 0, Member::Struct);
+  AddField(s, "m3", 0, 2, Member::Field);
+  AddField(s, "m4", 2, 4, Member::Field);
+  Member *u = AddField(s, "", 6, 0, Member::Union);
+  AddField(u, "m5", 6, 2, Member::Field);
+  AddField(u, "m6", 6, 2, Member::Field);
+  AddField(s, "m7", 8, 2, Member::Field);
+  EXPECT_EQ(WrappedRecord(this->record), WrappedRecord(record));
+}
diff --git a/llvm/docs/GoldPlugin.rst b/llvm/docs/GoldPlugin.rst
index 07d2fc203eba5..606f9e0820e60 100644
--- a/llvm/docs/GoldPlugin.rst
+++ b/llvm/docs/GoldPlugin.rst
@@ -83,7 +83,7 @@ which is why you otherwise need gold to be the installed system linker in
 your path.
 
 ``ar`` and ``nm`` also accept the ``-plugin`` option and it's possible to
-to install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup.
+install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup.
 If you built your own gold, be sure to install the ``ar`` and ``nm-new`` you
 built to ``/usr/bin``.
 
@@ -143,7 +143,7 @@ Quickstart for using LTO with autotooled projects
 =================================================
 
 Once your system ``ld``, ``ar``, and ``nm`` all support LLVM bitcode,
-everything is in place for an easy to use LTO build of autotooled projects:
+everything is in place for an easy-to-use LTO build of autotooled projects:
 
 * Follow the instructions :ref:`on how to build LLVMgold.so
   <lto-how-to-build>`.
diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h
index 3511776d3e4c1..528d14d71a1d9 100644
--- a/llvm/include/llvm/ADT/STLForwardCompat.h
+++ b/llvm/include/llvm/ADT/STLForwardCompat.h
@@ -143,7 +143,10 @@ struct identity // NOLINT(readability-identifier-naming)
 /// The std::pointer_traits<>::to_address(p) variations of these overloads has
 /// not been implemented.
 template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
-template <class T> constexpr T *to_address(T *P) { return P; }
+template <class T> constexpr T *to_address(T *P) {
+  static_assert(!std::is_function_v<T>);
+  return P;
+}
 
 //===----------------------------------------------------------------------===//
 //     Features from C++23
diff --git a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
index cb535ac14f1c6..a1b030c157eae 100644
--- a/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
+++ b/llvm/include/llvm/Analysis/DXILMetadataAnalysis.h
@@ -27,6 +27,9 @@ struct EntryProperties {
   unsigned NumThreadsX{0}; // X component
   unsigned NumThreadsY{0}; // Y component
   unsigned NumThreadsZ{0}; // Z component
+  unsigned WaveSizeMin{0}; // Minimum component
+  unsigned WaveSizeMax{0}; // Maximum component
+  unsigned WaveSizePref{0}; // Preferred component
 
   EntryProperties(const Function *Fn = nullptr) : Entry(Fn) {};
 };
diff --git a/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
new file mode 100644
index 0000000000000..e8eceeed6aca6
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/LibcallLoweringInfo.h
@@ -0,0 +1,74 @@
+//===- LibcallLoweringInfo.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+#define LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
+
+#include "llvm/IR/RuntimeLibcalls.h"
+
+namespace llvm {
+
+/// Tracks, for each RTLIB::Libcall, the RTLIB::LibcallImpl selected to lower
+/// it, and answers calling convention queries through the referenced
+/// RTLIB::RuntimeLibcallsInfo.
+class LibcallLoweringInfo {
+private:
+  const RTLIB::RuntimeLibcallsInfo &RTLCI;
+  /// Stores the implementation choice for each libcall.
+  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
+      RTLIB::Unsupported};
+
+public:
+  LLVM_ABI LibcallLoweringInfo(const RTLIB::RuntimeLibcallsInfo &RTLCI);
+
+  /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
+  const char *getLibcallName(RTLIB::Libcall Call) const {
+    // FIXME: Return StringRef
+    return RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpls[Call])
+        .data();
+  }
+
+  /// Return the lowering's selection of implementation call for \p Call
+  RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
+    return LibcallImpls[Call];
+  }
+
+  /// Select \p Impl as the implementation used to lower \p Call.
+  void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
+    LibcallImpls[Call] = Impl;
+  }
+
+  // FIXME: Remove this wrapper in favor of directly using
+  // getLibcallImplCallingConv
+  CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
+    return RTLCI.LibcallImplCallingConvs[LibcallImpls[Call]];
+  }
+
+  /// Get the CallingConv that should be used for the specified libcall
+  /// implementation.
+  CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
+    return RTLCI.LibcallImplCallingConvs[Call];
+  }
+
+  /// Return a function impl compatible with RTLIB::MEMCPY, or
+  /// RTLIB::Unsupported if fully unsupported.
+  RTLIB::LibcallImpl getMemcpyImpl() const {
+    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
+    if (Memcpy == RTLIB::Unsupported) {
+      // Fallback to memmove if memcpy isn't available.
+      return getLibcallImpl(RTLIB::MEMMOVE);
+    }
+
+    return Memcpy;
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_LIBCALLLOWERINGINFO_H
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b229659415d55..8aeaa9cdacfc1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/LibcallLoweringInfo.h"
 #include "llvm/CodeGen/LowLevelTypeUtils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
@@ -3232,6 +3233,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
+  /// Return true if the target interleave with shuffles are cheaper
+  virtual bool isProfitableToInterleaveWithGatherScatter() const {
+    return false;
+  }
+
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
@@ -3597,7 +3603,7 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   const RTLIB::RuntimeLibcallsInfo &getRuntimeLibcallsInfo() const {
-    return Libcalls;
+    return RuntimeLibcallInfo;
   }
 
   void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
@@ -3610,9 +3616,9 @@ class LLVM_ABI TargetLoweringBase {
   }
 
   /// Get the libcall routine name for the specified libcall.
+  // FIXME: This should be removed. Only LibcallImpl should have a name.
   const char *getLibcallName(RTLIB::Libcall Call) const {
-    // FIXME: Return StringRef
-    return Libcalls.getLibcallName(Call).data();
+    return Libcalls.getLibcallName(Call);
   }
 
   /// Get the libcall routine name for the specified libcall implementation
@@ -3625,7 +3631,7 @@ class LLVM_ABI TargetLoweringBase {
   /// Check if this is valid libcall for the current module, otherwise
   /// RTLIB::Unsupported.
   RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const {
-    return Libcalls.getSupportedLibcallImpl(FuncName);
+    return RuntimeLibcallInfo.getSupportedLibcallImpl(FuncName);
   }
 
   /// Get the comparison predicate that's to be used to test the result of the
@@ -3633,11 +3639,6 @@ class LLVM_ABI TargetLoweringBase {
   /// floating-point compare libcalls.
   ISD::CondCode getSoftFloatCmpLibcallPredicate(RTLIB::LibcallImpl Call) const;
 
-  /// Set the CallingConv that should be used for the specified libcall.
-  void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
-    Libcalls.setLibcallImplCallingConv(Call, CC);
-  }
-
   /// Get the CallingConv that should be used for the specified libcall
   /// implementation.
   CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
@@ -3834,8 +3835,11 @@ class LLVM_ABI TargetLoweringBase {
   std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
     PromoteToType;
 
+  /// FIXME: This should not live here; it should come from an analysis.
+  const RTLIB::RuntimeLibcallsInfo RuntimeLibcallInfo;
+
   /// The list of libcalls that the target will use.
-  RTLIB::RuntimeLibcallsInfo Libcalls;
+  LibcallLoweringInfo Libcalls;
 
   /// The bits of IndexedModeActions used to store the legalisation actions
   /// We store the data as   | ML | MS |  L |  S | each taking 4 bits.
diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
index 155cfe8dd3a98..711aa70a4a8d3 100644
--- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
+++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -708,7 +708,7 @@ struct DEMANGLE_ABI SpecialTableSymbolNode : public SymbolNode {
     return N->kind() == NodeKind::SpecialTableSymbol;
   }
 
-  QualifiedNameNode *TargetName = nullptr;
+  NodeArrayNode *TargetNames = nullptr;
   Qualifiers Quals = Qualifiers::Q_None;
 };
 
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index a0876b169e0b8..a7bcbf010d1bf 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -194,10 +194,11 @@ LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I,
 /// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch
 /// weights in the new instruction if the parent function of the original
 /// instruction has an entry count. This is to not confuse users by injecting
-/// profile data into non-profiled functions.
-LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
-                                                          Function &F,
-                                                          StringRef PassName);
+/// profile data into non-profiled functions. If \p F is nullptr, we will fetch
+/// the function from \p I.
+LLVM_ABI void
+setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName,
+                                            const Function *F = nullptr);
 
 /// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their
 /// entry counts.
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index bae760b3f981d..78e4b1723aafa 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -42,6 +42,8 @@ template <> struct enum_iteration_traits<RTLIB::LibcallImpl> {
   static constexpr bool is_iterable = true;
 };
 
+class LibcallLoweringInfo;
+
 namespace RTLIB {
 
 // Return an iterator over all Libcall values.
@@ -70,6 +72,8 @@ struct RuntimeLibcallsInfo {
   LibcallImplBitset AvailableLibcallImpls;
 
 public:
+  friend class llvm::LibcallLoweringInfo;
+
   explicit RuntimeLibcallsInfo(
       const Triple &TT,
       ExceptionHandling ExceptionModel = ExceptionHandling::None,
@@ -85,17 +89,6 @@ struct RuntimeLibcallsInfo {
     initLibcalls(TT, ExceptionModel, FloatABI, EABIVersion, ABIName);
   }
 
-  /// Rename the default libcall routine name for the specified libcall.
-  void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl) {
-    LibcallImpls[Call] = Impl;
-  }
-
-  /// Get the libcall routine name for the specified libcall.
-  // FIXME: This should be removed. Only LibcallImpl should have a name.
-  StringRef getLibcallName(RTLIB::Libcall Call) const {
-    return getLibcallImplName(LibcallImpls[Call]);
-  }
-
   /// Get the libcall routine name for the specified libcall implementation.
   static StringRef getLibcallImplName(RTLIB::LibcallImpl CallImpl) {
     if (CallImpl == RTLIB::Unsupported)
@@ -105,42 +98,24 @@ struct RuntimeLibcallsInfo {
                      RuntimeLibcallNameSizeTable[CallImpl]);
   }
 
-  /// Return the lowering's selection of implementation call for \p Call
-  RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const {
-    return LibcallImpls[Call];
-  }
-
   /// Set the CallingConv that should be used for the specified libcall
   /// implementation
   void setLibcallImplCallingConv(RTLIB::LibcallImpl Call, CallingConv::ID CC) {
     LibcallImplCallingConvs[Call] = CC;
   }
 
-  // FIXME: Remove this wrapper in favor of directly using
-  // getLibcallImplCallingConv
-  CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
-    return LibcallImplCallingConvs[LibcallImpls[Call]];
-  }
-
   /// Get the CallingConv that should be used for the specified libcall.
   CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const {
     return LibcallImplCallingConvs[Call];
   }
 
-  ArrayRef<RTLIB::LibcallImpl> getLibcallImpls() const {
-    // Trim UNKNOWN_LIBCALL from the back
-    return ArrayRef(LibcallImpls).drop_back();
+  /// Return the libcall provided by \p Impl
+  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
+    return ImplToLibcall[Impl];
   }
 
-  /// Return a function name compatible with RTLIB::MEMCPY, or nullptr if fully
-  /// unsupported.
-  RTLIB::LibcallImpl getMemcpyImpl() const {
-    RTLIB::LibcallImpl Memcpy = getLibcallImpl(RTLIB::MEMCPY);
-    if (Memcpy != RTLIB::Unsupported)
-      return Memcpy;
-
-    // Fallback to memmove if memcpy isn't available.
-    return getLibcallImpl(RTLIB::MEMMOVE);
+  unsigned getNumAvailableLibcallImpls() const {
+    return AvailableLibcallImpls.count();
   }
 
   bool isAvailable(RTLIB::LibcallImpl Impl) const {
@@ -151,11 +126,6 @@ struct RuntimeLibcallsInfo {
     AvailableLibcallImpls.set(Impl);
   }
 
-  /// Return the libcall provided by \p Impl
-  static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
-    return ImplToLibcall[Impl];
-  }
-
   /// Check if a function name is a recognized runtime call of any kind. This
   /// does not consider if this call is available for any current compilation,
   /// just that it is a known call somewhere. This returns the set of all
@@ -176,11 +146,8 @@ struct RuntimeLibcallsInfo {
   LLVM_ABI RTLIB::LibcallImpl
       getSupportedLibcallImpl(StringRef FuncName) const {
     for (RTLIB::LibcallImpl Impl : lookupLibcallImplName(FuncName)) {
-      // FIXME: This should not depend on looking up ImplToLibcall, only the
-      // list of libcalls for the module.
-      RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]];
-      if (Recognized != RTLIB::Unsupported)
-        return Recognized;
+      if (isAvailable(Impl))
+        return Impl;
     }
 
     return RTLIB::Unsupported;
@@ -197,10 +164,6 @@ struct RuntimeLibcallsInfo {
   LLVM_ABI static iota_range<RTLIB::LibcallImpl>
   lookupLibcallImplNameImpl(StringRef Name);
 
-  /// Stores the implementation choice for each each libcall.
-  RTLIB::LibcallImpl LibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1] = {
-      RTLIB::Unsupported};
-
   static_assert(static_cast<int>(CallingConv::C) == 0,
                 "default calling conv should be encoded as 0");
 
@@ -274,6 +237,7 @@ struct RuntimeLibcallsInfo {
 };
 
 } // namespace RTLIB
+
 } // namespace llvm
 
 #endif // LLVM_IR_RUNTIME_LIBCALLS_H
diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
index b5752c1b69ad8..92853125379f5 100644
--- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
+++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td
@@ -61,7 +61,6 @@ class RuntimeLibcall {
 class RuntimeLibcallImpl<RuntimeLibcall P, string Name = NAME> {
   RuntimeLibcall Provides = P;
   string LibCallFuncName = Name;
-  list<LibcallLoweringPredicate> LoweringPredicates;
   bit IsDefault = false;
 }
 
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index 9bbb8a2a30541..0a253efc2abcb 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -225,7 +225,7 @@ inline constexpr int64_t minIntN(int64_t N) {
 
   if (N == 0)
     return 0;
-  return UINT64_C(1) + ~(UINT64_C(1) << (N - 1));
+  return UINT64_MAX << (N - 1);
 }
 
 /// Gets the maximum value for a N-bit signed integer.
@@ -241,7 +241,7 @@ inline constexpr int64_t maxIntN(int64_t N) {
 
 /// Checks if an unsigned integer fits into the given (dynamic) bit width.
 inline constexpr bool isUIntN(unsigned N, uint64_t x) {
-  return N >= 64 || x <= maxUIntN(N);
+  return N >= 64 || (x >> N) == 0;
 }
 
 /// Checks if an signed integer fits into the given (dynamic) bit width.
diff --git a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
index 23f1aa82ae8a3..bd77cba385667 100644
--- a/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
+++ b/llvm/lib/Analysis/DXILMetadataAnalysis.cpp
@@ -66,6 +66,22 @@ static ModuleMetadataInfo collectMetadataInfo(Module &M) {
       Success = llvm::to_integer(NumThreadsVec[2], EFP.NumThreadsZ, 10);
       assert(Success && "Failed to parse Z component of numthreads");
     }
+    // Get wavesize attribute value, if one exists
+    StringRef WaveSizeStr =
+        F.getFnAttribute("hlsl.wavesize").getValueAsString();
+    if (!WaveSizeStr.empty()) {
+      SmallVector<StringRef> WaveSizeVec;
+      WaveSizeStr.split(WaveSizeVec, ',');
+      assert(WaveSizeVec.size() == 3 && "Invalid wavesize specified");
+      // Read in the three component values of wavesize
+      [[maybe_unused]] bool Success =
+          llvm::to_integer(WaveSizeVec[0], EFP.WaveSizeMin, 10);
+      assert(Success && "Failed to parse Min component of wavesize");
+      Success = llvm::to_integer(WaveSizeVec[1], EFP.WaveSizeMax, 10);
+      assert(Success && "Failed to parse Max component of wavesize");
+      Success = llvm::to_integer(WaveSizeVec[2], EFP.WaveSizePref, 10);
+      assert(Success && "Failed to parse Preferred component of wavesize");
+    }
     MMDAI.EntryPropertyVec.push_back(EFP);
   }
   return MMDAI;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f0ec9bf885c12..3f5387738c328 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1755,7 +1755,6 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
   OutStreamer->pushSection();
   OutStreamer->switchSection(FuncCGSection);
 
-  const MCSymbol *FunctionSymbol = getFunctionBegin();
   const Function &F = MF.getFunction();
   // If this function has external linkage or has its address taken and
   // it is not a callback, then anything could call it.
@@ -1794,7 +1793,7 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF,
   // 8) Each unique indirect target type id.
   OutStreamer->emitInt8(CallGraphSectionFormatVersion::V_0);
   OutStreamer->emitInt8(static_cast<uint8_t>(CGFlags));
-  OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+  OutStreamer->emitSymbolValue(getSymbol(&F), TM.getProgramPointerSize());
   const auto *TypeId = extractNumericCGTypeId(F);
   if (IsIndirectTarget && TypeId)
     OutStreamer->emitInt64(TypeId->getZExtValue());
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 6412949948c07..d9bc042d6807e 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1301,7 +1301,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
   // Atomic RMW expands to a Load-linked / Store-Conditional loop, because it is
   // hard to predict precise branch weigths we mark the branch as "unknown"
   // (50/50) to prevent misleading optimizations.
-  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   return Loaded;
@@ -1686,7 +1686,12 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
 
   Loaded->addIncoming(NewLoaded, LoopBB);
 
-  Builder.CreateCondBr(Success, ExitBB, LoopBB);
+  Instruction *CondBr = Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+  // Atomic RMW expands to a cmpxchg loop. Since precise branch weights
+  // cannot be easily determined here, we mark the branch as "unknown" (50/50)
+  // to prevent misleading optimizations.
+  setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
   return NewLoaded;
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 4373c5397a3c6..1cf0b4964760b 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -88,6 +88,7 @@ add_llvm_component_library(LLVMCodeGen
   LatencyPriorityQueue.cpp
   LazyMachineBlockFrequencyInfo.cpp
   LexicalScopes.cpp
+  LibcallLoweringInfo.cpp
   LiveDebugVariables.cpp
   LiveIntervals.cpp
   LiveInterval.cpp
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 5c27a20869f81..45eca28ffb8a2 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
-                               unsigned MaxFactor) {
+                               unsigned MaxFactor,
+                               bool InterleaveWithShuffles) {
   unsigned NumElts = SVI->getShuffleMask().size();
   if (NumElts < 4)
     return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
       return true;
   }
 
+  if (InterleaveWithShuffles) {
+    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+      Factor = i * MaxFactor;
+      if (SVI->isInterleave(Factor))
+        return true;
+    }
+  }
   return false;
 }
 
@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
       cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+  if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+                          TLI->isProfitableToInterleaveWithGatherScatter()))
     return false;
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/CodeGen/LibcallLoweringInfo.cpp b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
new file mode 100644
index 0000000000000..5c1698cb6060e
--- /dev/null
+++ b/llvm/lib/CodeGen/LibcallLoweringInfo.cpp
@@ -0,0 +1,26 @@
+//===- LibcallLoweringInfo.cpp - Interface for runtime libcalls -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LibcallLoweringInfo.h"
+
+using namespace llvm;
+
+LibcallLoweringInfo::LibcallLoweringInfo(
+    const RTLIB::RuntimeLibcallsInfo &RTLCI)
+    : RTLCI(RTLCI) {
+  // TODO: This should be generated with lowering predicates, and assert the
+  // call is available.
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (RTLCI.isAvailable(Impl)) {
+      RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+      // FIXME: Hack, assume the first available libcall wins.
+      if (LibcallImpls[LC] == RTLIB::Unsupported)
+        LibcallImpls[LC] = Impl;
+    }
+  }
+}
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b3535eaca5e9d..1cc591c17f9c3 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -697,9 +697,11 @@ ISD::CondCode TargetLoweringBase::getSoftFloatCmpLibcallPredicate(
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), Libcalls(TM.getTargetTriple(), TM.Options.ExceptionModel,
-                       TM.Options.FloatABIType, TM.Options.EABIVersion,
-                       TM.Options.MCOptions.getABIName()) {
+    : TM(tm),
+      RuntimeLibcallInfo(TM.getTargetTriple(), TM.Options.ExceptionModel,
+                         TM.Options.FloatABIType, TM.Options.EABIVersion,
+                         TM.Options.MCOptions.getABIName()),
+      Libcalls(RuntimeLibcallInfo) {
   initActions();
 
   // Perform these initializations only once.
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index b22928be3be50..0aefe6e077c24 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -277,6 +277,18 @@ demanglePointerCVQualifiers(std::string_view &MangledName) {
   DEMANGLE_UNREACHABLE;
 }
 
+static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
+                                          size_t Count) {
+  NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
+  N->Count = Count;
+  N->Nodes = Arena.allocArray<Node *>(Count);
+  for (size_t I = 0; I < Count; ++I) {
+    N->Nodes[I] = Head->N;
+    Head = Head->Next;
+  }
+  return N;
+}
+
 std::string_view Demangler::copyString(std::string_view Borrowed) {
   char *Stable = Arena.allocUnalignedBuffer(Borrowed.size());
   // This is not a micro-optimization, it avoids UB, should Borrowed be an null
@@ -323,8 +335,30 @@ Demangler::demangleSpecialTableSymbolNode(std::string_view &MangledName,
   }
 
   std::tie(STSN->Quals, IsMember) = demangleQualifiers(MangledName);
-  if (!consumeFront(MangledName, '@'))
-    STSN->TargetName = demangleFullyQualifiedTypeName(MangledName);
+
+  NodeList *TargetCurrent = nullptr;
+  NodeList *TargetHead = nullptr;
+  size_t Count = 0;
+  while (!consumeFront(MangledName, '@')) {
+    ++Count;
+
+    NodeList *Next = Arena.alloc<NodeList>();
+    if (TargetCurrent)
+      TargetCurrent->Next = Next;
+    else
+      TargetHead = Next;
+
+    TargetCurrent = Next;
+    QualifiedNameNode *QN = demangleFullyQualifiedTypeName(MangledName);
+    if (Error)
+      return nullptr;
+    assert(QN);
+    TargetCurrent->N = QN;
+  }
+
+  if (Count > 0)
+    STSN->TargetNames = nodeListToNodeArray(Arena, TargetHead, Count);
+
   return STSN;
 }
 
@@ -1605,18 +1639,6 @@ Demangler::demangleNameScopePiece(std::string_view &MangledName) {
   return demangleSimpleName(MangledName, /*Memorize=*/true);
 }
 
-static NodeArrayNode *nodeListToNodeArray(ArenaAllocator &Arena, NodeList *Head,
-                                          size_t Count) {
-  NodeArrayNode *N = Arena.alloc<NodeArrayNode>();
-  N->Count = Count;
-  N->Nodes = Arena.allocArray<Node *>(Count);
-  for (size_t I = 0; I < Count; ++I) {
-    N->Nodes[I] = Head->N;
-    Head = Head->Next;
-  }
-  return N;
-}
-
 QualifiedNameNode *
 Demangler::demangleNameScopeChain(std::string_view &MangledName,
                                   IdentifierNode *UnqualifiedName) {
diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
index 61e4961c714bc..17c6aab500049 100644
--- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -662,9 +662,9 @@ void VcallThunkIdentifierNode::output(OutputBuffer &OB,
 void SpecialTableSymbolNode::output(OutputBuffer &OB, OutputFlags Flags) const {
   outputQualifiers(OB, Quals, false, true);
   Name->output(OB, Flags);
-  if (TargetName) {
+  if (TargetNames) {
     OB << "{for `";
-    TargetName->output(OB, Flags);
+    TargetNames->output(OB, Flags, "'s `");
     OB << "'}";
   }
 }
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 46cf60be1bafd..98f10a5a60f24 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -758,14 +758,12 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
 
 AbstractSlotTrackerStorage::~AbstractSlotTrackerStorage() = default;
 
-namespace llvm {
-
 //===----------------------------------------------------------------------===//
 // SlotTracker Class: Enumerate slot numbers for unnamed values
 //===----------------------------------------------------------------------===//
 /// This class provides computation of slot numbers for LLVM Assembly writing.
 ///
-class SlotTracker : public AbstractSlotTrackerStorage {
+class llvm::SlotTracker : public AbstractSlotTrackerStorage {
 public:
   /// ValueMap - A mapping of Values to slot numbers.
   using ValueMap = DenseMap<const Value *, unsigned>;
@@ -943,8 +941,6 @@ class SlotTracker : public AbstractSlotTrackerStorage {
   void processDbgRecordMetadata(const DbgRecord &DVR);
 };
 
-} // end namespace llvm
-
 ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
                                      const Function *F)
     : M(M), F(F), Machine(&Machine) {}
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 01dafcab94ce9..bfba6e0cab6bf 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -10,10 +10,11 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DebugInfo.h"
 
+using namespace llvm;
+
 #if LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 #include "llvm/Support/Signals.h"
 
-namespace llvm {
 DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) {
   if (!ShouldCollectTrace)
     return;
@@ -30,11 +31,8 @@ void DbgLocOrigin::addTrace() {
   auto &[Depth, StackTrace] = StackTraces.emplace_back();
   Depth = sys::getStackTrace(StackTrace);
 }
-} // namespace llvm
 #endif
 
-using namespace llvm;
-
 #if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
     : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L),
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 3fc3d28ba34fd..926a009b7831f 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -12,8 +12,9 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Compiler.h"
 
-namespace llvm {
+using namespace llvm;
 
+namespace llvm {
 template <typename T>
 DbgRecordParamRef<T>::DbgRecordParamRef(const T *Param)
     : Ref(const_cast<T *>(Param)) {}
@@ -28,6 +29,7 @@ template <typename T> T *DbgRecordParamRef<T>::get() const {
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DIExpression>;
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILabel>;
 template class LLVM_EXPORT_TEMPLATE DbgRecordParamRef<DILocalVariable>;
+} // namespace llvm
 
 DbgVariableRecord::DbgVariableRecord(const DbgVariableIntrinsic *DVI)
     : DbgRecord(ValueKind, DVI->getDebugLoc()),
@@ -756,5 +758,3 @@ iterator_range<simple_ilist<DbgRecord>::iterator> DbgMarker::cloneDebugInfoFrom(
     // We inserted a block at the end, return that range.
     return {First->getIterator(), StoredDbgRecords.end()};
 }
-
-} // end namespace llvm
diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp
index 67f21d3756e93..c41d7b3181a37 100644
--- a/llvm/lib/IR/FPEnv.cpp
+++ b/llvm/lib/IR/FPEnv.cpp
@@ -19,9 +19,10 @@
 #include "llvm/IR/Intrinsics.h"
 #include <optional>
 
-namespace llvm {
+using namespace llvm;
 
-std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
+std::optional<RoundingMode>
+llvm::convertStrToRoundingMode(StringRef RoundingArg) {
   // For dynamic rounding mode, we use round to nearest but we will set the
   // 'exact' SDNodeFlag so that the value will not be rounded.
   return StringSwitch<std::optional<RoundingMode>>(RoundingArg)
@@ -34,7 +35,8 @@ std::optional<RoundingMode> convertStrToRoundingMode(StringRef RoundingArg) {
       .Default(std::nullopt);
 }
 
-std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
+std::optional<StringRef>
+llvm::convertRoundingModeToStr(RoundingMode UseRounding) {
   std::optional<StringRef> RoundingStr;
   switch (UseRounding) {
   case RoundingMode::Dynamic:
@@ -62,7 +64,7 @@ std::optional<StringRef> convertRoundingModeToStr(RoundingMode UseRounding) {
 }
 
 std::optional<fp::ExceptionBehavior>
-convertStrToExceptionBehavior(StringRef ExceptionArg) {
+llvm::convertStrToExceptionBehavior(StringRef ExceptionArg) {
   return StringSwitch<std::optional<fp::ExceptionBehavior>>(ExceptionArg)
       .Case("fpexcept.ignore", fp::ebIgnore)
       .Case("fpexcept.maytrap", fp::ebMayTrap)
@@ -71,7 +73,7 @@ convertStrToExceptionBehavior(StringRef ExceptionArg) {
 }
 
 std::optional<StringRef>
-convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
+llvm::convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
   std::optional<StringRef> ExceptStr;
   switch (UseExcept) {
   case fp::ebStrict:
@@ -87,7 +89,7 @@ convertExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
   return ExceptStr;
 }
 
-Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
+Intrinsic::ID llvm::getConstrainedIntrinsicID(const Instruction &Instr) {
   Intrinsic::ID IID = Intrinsic::not_intrinsic;
   switch (Instr.getOpcode()) {
   case Instruction::FCmp:
@@ -127,5 +129,3 @@ Intrinsic::ID getConstrainedIntrinsicID(const Instruction &Instr) {
 
   return IID;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 88dbd176e0d3f..95edb2e8e56d8 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -1019,8 +1019,7 @@ Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
                                                      const Twine &Name) {
   Value *Ret = CreateSelectFMF(C, True, False, {}, Name);
   if (auto *SI = dyn_cast<SelectInst>(Ret)) {
-    setExplicitlyUnknownBranchWeightsIfProfiled(
-        *SI, *SI->getParent()->getParent(), PassName);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*SI, PassName);
   }
   return Ret;
 }
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 39e5463cb6fc3..c3e54a0fc0c7e 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -17,7 +17,8 @@
 
 #include "ConstantsContext.h"
 
-namespace llvm {
+using namespace llvm;
+
 bool Operator::hasPoisonGeneratingFlags() const {
   switch (getOpcode()) {
   case Instruction::Add:
@@ -288,4 +289,3 @@ void FastMathFlags::print(raw_ostream &O) const {
       O << " afn";
   }
 }
-} // namespace llvm
diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp
index 4e27086e97ac5..cb1b91a98b036 100644
--- a/llvm/lib/IR/PassTimingInfo.cpp
+++ b/llvm/lib/IR/PassTimingInfo.cpp
@@ -32,10 +32,10 @@ using namespace llvm;
 
 #define DEBUG_TYPE "time-passes"
 
-namespace llvm {
+using namespace llvm;
 
-bool TimePassesIsEnabled = false;
-bool TimePassesPerRun = false;
+bool llvm::TimePassesIsEnabled = false;
+bool llvm::TimePassesPerRun = false;
 
 static cl::opt<bool, true> EnableTiming(
     "time-passes", cl::location(TimePassesIsEnabled), cl::Hidden,
@@ -139,7 +139,7 @@ PassTimingInfo *PassTimingInfo::TheTimeInfo;
 } // namespace legacy
 } // namespace
 
-Timer *getPassTimer(Pass *P) {
+Timer *llvm::getPassTimer(Pass *P) {
   legacy::PassTimingInfo::init();
   if (legacy::PassTimingInfo::TheTimeInfo)
     return legacy::PassTimingInfo::TheTimeInfo->getPassTimer(P, P);
@@ -148,7 +148,7 @@ Timer *getPassTimer(Pass *P) {
 
 /// If timing is enabled, report the times collected up to now and then reset
 /// them.
-void reportAndResetTimings(raw_ostream *OutStream) {
+void llvm::reportAndResetTimings(raw_ostream *OutStream) {
   if (legacy::PassTimingInfo::TheTimeInfo)
     legacy::PassTimingInfo::TheTimeInfo->print(OutStream);
 }
@@ -315,5 +315,3 @@ void TimePassesHandler::registerCallbacks(PassInstrumentationCallbacks &PIC) {
   PIC.registerAfterAnalysisCallback(
       [this](StringRef P, Any) { this->stopAnalysisTimer(P); });
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index fc2be5188f456..94dbe1f3988b8 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -274,9 +274,12 @@ void llvm::setExplicitlyUnknownBranchWeights(Instruction &I,
 }
 
 void llvm::setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
-                                                       Function &F,
-                                                       StringRef PassName) {
-  if (std::optional<Function::ProfileCount> EC = F.getEntryCount();
+                                                       StringRef PassName,
+                                                       const Function *F) {
+  F = F ? F : I.getFunction();
+  assert(F && "Either pass a instruction attached to a Function, or explicitly "
+              "pass the Function that it will be attached to");
+  if (std::optional<Function::ProfileCount> EC = F->getEntryCount();
       EC && EC->getCount() > 0)
     setExplicitlyUnknownBranchWeights(I, PassName);
 }
diff --git a/llvm/lib/IR/PseudoProbe.cpp b/llvm/lib/IR/PseudoProbe.cpp
index 59f218cc3683b..3c05f4b1f86a2 100644
--- a/llvm/lib/IR/PseudoProbe.cpp
+++ b/llvm/lib/IR/PseudoProbe.cpp
@@ -19,9 +19,7 @@
 
 using namespace llvm;
 
-namespace llvm {
-
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
 extractProbeFromDiscriminator(const DILocation *DIL) {
   if (DIL) {
     auto Discriminator = DIL->getDiscriminator();
@@ -43,7 +41,7 @@ extractProbeFromDiscriminator(const DILocation *DIL) {
   return std::nullopt;
 }
 
-std::optional<PseudoProbe>
+static std::optional<PseudoProbe>
 extractProbeFromDiscriminator(const Instruction &Inst) {
   assert(isa<CallBase>(&Inst) && !isa<IntrinsicInst>(&Inst) &&
          "Only call instructions should have pseudo probe encodes as their "
@@ -53,7 +51,7 @@ extractProbeFromDiscriminator(const Instruction &Inst) {
   return std::nullopt;
 }
 
-std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
+std::optional<PseudoProbe> llvm::extractProbe(const Instruction &Inst) {
   if (const auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
     PseudoProbe Probe;
     Probe.Id = II->getIndex()->getZExtValue();
@@ -73,7 +71,7 @@ std::optional<PseudoProbe> extractProbe(const Instruction &Inst) {
   return std::nullopt;
 }
 
-void setProbeDistributionFactor(Instruction &Inst, float Factor) {
+void llvm::setProbeDistributionFactor(Instruction &Inst, float Factor) {
   assert(Factor >= 0 && Factor <= 1 &&
          "Distribution factor must be in [0, 1.0]");
   if (auto *II = dyn_cast<PseudoProbeInst>(&Inst)) {
@@ -111,5 +109,3 @@ void setProbeDistributionFactor(Instruction &Inst, float Factor) {
     }
   }
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index 962368f061851..b3586b45a23f2 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -16,7 +16,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 
-namespace llvm {
+using namespace llvm;
 
 static bool isExpandableUser(User *U) {
   return isa<ConstantExpr>(U) || isa<ConstantAggregate>(U);
@@ -49,10 +49,10 @@ static SmallVector<Instruction *, 4> expandUser(BasicBlock::iterator InsertPt,
   return NewInsts;
 }
 
-bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
-                                           Function *RestrictToFunc,
-                                           bool RemoveDeadConstants,
-                                           bool IncludeSelf) {
+bool llvm::convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
+                                                 Function *RestrictToFunc,
+                                                 bool RemoveDeadConstants,
+                                                 bool IncludeSelf) {
   // Find all expandable direct users of Consts.
   SmallVector<Constant *> Stack;
   for (Constant *C : Consts) {
@@ -121,5 +121,3 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
 
   return Changed;
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 2ce5719228a0d..2fb01a4f95fea 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -19,6 +19,7 @@
 using namespace llvm;
 using namespace RTLIB;
 
+#define GET_RUNTIME_LIBCALLS_INFO
 #define GET_INIT_RUNTIME_LIBCALL_NAMES
 #define GET_SET_TARGET_RUNTIME_LIBCALL_SETS
 #define DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp
index 67882ba0144b4..504233575594d 100644
--- a/llvm/lib/IR/Use.cpp
+++ b/llvm/lib/IR/Use.cpp
@@ -9,7 +9,7 @@
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
 
-namespace llvm {
+using namespace llvm;
 
 void Use::swap(Use &RHS) {
   if (Val == RHS.Val)
@@ -42,5 +42,3 @@ void Use::zap(Use *Start, const Use *Stop, bool del) {
   if (del)
     ::operator delete(Start);
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index ab44cb4b8a3f7..9bb7c1298593a 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -11,8 +11,11 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IntrinsicInst.h"
 
+using namespace llvm;
+
 namespace llvm {
 class BasicBlock;
+}
 
 //===----------------------------------------------------------------------===//
 //                                 User Class
@@ -214,5 +217,3 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) {
     ::operator delete(Storage);
   }
 }
-
-} // namespace llvm
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 6d23dad2e185b..9b888927e4e20 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -136,9 +136,7 @@ static cl::opt<bool> VerifyNoAliasScopeDomination(
     cl::desc("Ensure that llvm.experimental.noalias.scope.decl for identical "
              "scopes are not dominating"));
 
-namespace llvm {
-
-struct VerifierSupport {
+struct llvm::VerifierSupport {
   raw_ostream *OS;
   const Module &M;
   ModuleSlotTracker MST;
@@ -318,8 +316,6 @@ struct VerifierSupport {
   }
 };
 
-} // namespace llvm
-
 namespace {
 
 class Verifier : public InstVisitor<Verifier>, VerifierSupport {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 23be42f9d60ce..fefc733fa7697 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1396,11 +1396,10 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
 SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
   RTLIB::RuntimeLibcallsInfo Libcalls(TT);
   SmallVector<const char *> LibcallSymbols;
-  ArrayRef<RTLIB::LibcallImpl> LibcallImpls = Libcalls.getLibcallImpls();
-  LibcallSymbols.reserve(LibcallImpls.size());
+  LibcallSymbols.reserve(Libcalls.getNumAvailableLibcallImpls());
 
-  for (RTLIB::LibcallImpl Impl : LibcallImpls) {
-    if (Impl != RTLIB::Unsupported)
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (Libcalls.isAvailable(Impl))
       LibcallSymbols.push_back(Libcalls.getLibcallImplName(Impl).data());
   }
 
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index e5e5fc20728e8..29f291614ffc6 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -37,8 +37,6 @@ unsigned Object::getMachine() const {
     return *Header.Machine;
   return llvm::ELF::EM_NONE;
 }
-
-constexpr StringRef SectionHeaderTable::TypeStr;
 } // namespace ELFYAML
 
 namespace yaml {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d08f9b94227a2..298746863d221 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -96,6 +96,7 @@
 #include <cctype>
 #include <cstdint>
 #include <cstdlib>
+#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>
@@ -17989,11 +17990,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   unsigned Factor,
                                                   const APInt &GapMask) const {
 
-  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
-         "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
+
+  if (isProfitableToInterleaveWithGatherScatter() &&
+      Factor > getMaxSupportedInterleaveFactor())
+    return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
   assert(!LaneMask && GapMask.popcount() == Factor &&
          "Unexpected mask on store");
 
@@ -18139,6 +18146,126 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
+/// If the interleave factor is greater than the supported MaxFactor, the
+/// data can be interleaved with additional shuffles and then stored with
+/// several stN instructions to achieve the same result.
+///
+/// Consider data with an interleave factor of 8 that is shuffled for storing
+/// with stN instructions. Data needs to be stored in this order:
+///     [v0, v1, v2, v3, v4, v5, v6, v7]
+///
+///    v0      v4      v2      v6      v1      v5      v3      v7
+///    |       |       |       |       |       |       |       |
+///     \     /         \     /         \     /         \     /
+///   [zip v0,v4]      [zip v2,v6]    [zip v1,v5]      [zip v3,v7] ==> stN = 4
+///        |               |              |                 |
+///         \             /                \               /
+///          \           /                  \             /
+///           \         /                    \           /
+///       [zip [v0,v2,v4,v6]]            [zip [v1,v3,v5,v7]]     ==> stN = 2
+///
+/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
+/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
+/// another st4.
+///
+/// For stN = 2, upper half of interleaved data V0, V1 is stored
+/// with one st2 instruction. Second set V2, V3 is stored with another st2.
+/// Total of 4 st2's are required here.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+  auto *VecTy = cast<FixedVectorType>(SVI->getType());
+  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getNumElements() / Factor;
+  Type *EltTy = VecTy->getElementType();
+  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
+
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+    return false;
+
+  if (UseScalable)
+    return false;
+
+  std::deque<Value *> Shuffles;
+  Shuffles.push_back(SVI);
+  unsigned ConcatLevel = Factor;
+  // Getting all the interleaved operands.
+  while (ConcatLevel > 1) {
+    unsigned InterleavedOperands = Shuffles.size();
+    for (unsigned i = 0; i < InterleavedOperands; i++) {
+      ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(Shuffles.front());
+      if (!SFL)
+        return false;
+      Shuffles.pop_front();
+
+      Value *Op0 = SFL->getOperand(0);
+      Value *Op1 = SFL->getOperand(1);
+
+      Shuffles.push_back(dyn_cast<Value>(Op0));
+      Shuffles.push_back(dyn_cast<Value>(Op1));
+    }
+    ConcatLevel >>= 1;
+  }
+
+  IRBuilder<> Builder(SI);
+  auto Mask = createInterleaveMask(LaneLen, 2);
+  SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
+  for (unsigned i = 0; i < LaneLen; i++) {
+    LowerHalfMask[i] = Mask[i];
+    UpperHalfMask[i] = Mask[i + LaneLen];
+  }
+
+  unsigned InterleaveFactor = Factor >> 1;
+  while (InterleaveFactor >= MaxSupportedFactor) {
+    std::deque<Value *> ShufflesIntermediate;
+    ShufflesIntermediate.resize(Factor);
+    for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
+        ShufflesIntermediate[i + j] = Shuffle;
+        Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
+        ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle;
+      }
+    }
+    Shuffles = ShufflesIntermediate;
+    InterleaveFactor >>= 1;
+  }
+
+  Type *PtrTy = SI->getPointerOperandType();
+  auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+  Value *BaseAddr = SI->getPointerOperand();
+  Function *StNFunc = getStructuredStoreFunction(
+      SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+  for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+    SmallVector<Value *, 5> Ops;
+    for (unsigned j = 0; j < MaxSupportedFactor; j++)
+      Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
+
+    if (i > 0) {
+      // Advance the base address past the LaneLen * MaxSupportedFactor
+      // elements written by the previous stN, using a GEP over the scalar
+      // element type.
+      BaseAddr = Builder.CreateConstGEP1_32(
+          SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+    }
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..bfd8474bfeec9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
 
+  bool isProfitableToInterleaveWithGatherScatter() const override {
+    return true;
+  }
+
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
                              ShuffleVectorInst *SVI, unsigned Factor,
                              const APInt &GapMask) const override;
 
+  bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+                                        unsigned Factor) const;
+
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 30b7b03f7a69a..52b216c7fe0f0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -197,6 +197,12 @@ def G_SMULL : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_PMULL : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
 def G_UADDLP : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1);
@@ -273,6 +279,7 @@ def : GINodeEquiv<G_FCMGT, AArch64fcmgt>;
 
 def : GINodeEquiv<G_BSP, AArch64bsp>;
 
+def : GINodeEquiv<G_PMULL, AArch64pmull>;
 def : GINodeEquiv<G_UMULL, AArch64umull>;
 def : GINodeEquiv<G_SMULL, AArch64smull>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..8729ed3890131 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4922,11 +4922,36 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  unsigned NumLoadStores = 1;
+  InstructionCost ShuffleCost = 0;
+  bool isInterleaveWithShuffle = false;
+  unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+  auto *SubVecTy =
+      VectorType::get(VecVTy->getElementType(),
+                      VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+  if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+      Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+      Factor > MaxSupportedFactor) {
+    isInterleaveWithShuffle = true;
+    SmallVector<int, 16> Mask;
+    // Prepare the interleave mask.
+    for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+         i++) {
+      for (unsigned j = 0; j < 2; j++)
+        Mask.push_back(j * Factor + i);
+    }
+
+    NumLoadStores = Factor / MaxSupportedFactor;
+    ShuffleCost =
+        (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+                                 Mask, CostKind, 0, SubVecTy));
+  }
+
+  if (!UseMaskForGaps &&
+      (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
-    auto *SubVecTy =
-        VectorType::get(VecVTy->getElementType(),
-                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
@@ -4934,7 +4959,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
     bool UseScalable;
     if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+      return (Factor *
+              TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+              NumLoadStores) +
+             ShuffleCost;
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5f93847bc680e..038ad77ae69b2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1809,6 +1809,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return LowerBinOp(TargetOpcode::G_FMAXNUM);
   case Intrinsic::aarch64_neon_fminnm:
     return LowerBinOp(TargetOpcode::G_FMINNUM);
+  case Intrinsic::aarch64_neon_pmull:
+  case Intrinsic::aarch64_neon_pmull64:
+    return LowerBinOp(AArch64::G_PMULL);
   case Intrinsic::aarch64_neon_smull:
     return LowerBinOp(AArch64::G_SMULL);
   case Intrinsic::aarch64_neon_umull:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 6d2d70511e894..6b920f05227ad 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -560,6 +560,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
   case TargetOpcode::G_FCMP:
   case TargetOpcode::G_LROUND:
   case TargetOpcode::G_LLROUND:
+  case AArch64::G_PMULL:
     return true;
   case TargetOpcode::G_INTRINSIC:
     switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 54d94b1f8682e..4fe194c813c46 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2366,6 +2366,18 @@ def isGFX8GFX9NotGFX90A :
             " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">,
   AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>;
 
+// Pre-90A GFX9s allow the NV bit in FLAT instructions.
+def isNVAllowedInFlat :
+  Predicate<"!Subtarget->hasGFX90AInsts() &&"
+            " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
+  AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureGFX90AInsts), (not FeatureGFX10Insts))>;
+
+// GFX8 or GFX90A+ do not allow the NV bit in FLAT instructions.
+def isNVNotAllowedInFlat :
+  Predicate<"(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) ||"
+            " ((Subtarget->getGeneration() == AMDGPUSubtarget::GFX9) && Subtarget->hasGFX90AInsts())">,
+  AssemblerPredicate <(any_of FeatureVolcanicIslands, FeatureGFX90AInsts)>;
+
 def isGFX90AOnly :
   Predicate<"Subtarget->hasGFX90AInsts() && !Subtarget->hasGFX940Insts()">,
   AssemblerPredicate<(all_of FeatureGFX90AInsts, (not FeatureGFX940Insts))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0a5913293238a..fdff21b6ef8df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1565,8 +1565,11 @@ void SplitPtrStructs::processConditionals() {
     } else if (isa<SelectInst>(I)) {
       if (MaybeRsrc) {
         if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
-          ConditionalTemps.push_back(RsrcInst);
-          RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+          // Guard against conditionals that were already folded away.
+          if (RsrcInst != *MaybeRsrc) {
+            ConditionalTemps.push_back(RsrcInst);
+            RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+          }
         }
         for (Value *V : Seen)
           FoundRsrcs[V] = *MaybeRsrc;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 09338c533fdf2..2808c44c59c11 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1602,6 +1602,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 
   bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
 
+  bool isFlatInstAndNVAllowed(const MCInst &Inst) const {
+    uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+    return (TSFlags & SIInstrFlags::FLAT) && isGFX9() && !isGFX90A();
+  }
+
   AMDGPUTargetStreamer &getTargetStreamer() {
     MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
     return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -5370,7 +5375,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scale_offset")]);
       Error(S, "scale_offset is not supported on this GPU");
     }
-    if (CPol & CPol::NV) {
+    if ((CPol & CPol::NV) && !isFlatInstAndNVAllowed(Inst)) {
       SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
       StringRef CStr(S.getPointer());
       S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]);
@@ -7145,6 +7150,13 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
   unsigned Enabled = 0, Seen = 0;
   for (;;) {
     SMLoc S = getLoc();
+
+    if (isGFX9() && trySkipId("nv")) {
+      Enabled |= CPol::NV;
+      Seen |= CPol::NV;
+      continue;
+    }
+
     bool Disabling;
     unsigned CPol = getCPolKind(getId(), Mnemo, Disabling);
     if (!CPol)
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 21b339f2c6784..95bc9438441b9 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -125,7 +125,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
   bits<7> saddr;
   bits<10> vdst;
 
-  bits<5> cpol;
+  bits<6> cpol;
 
   // Only valid on gfx9
   bits<1> lds = ps.lds; // LDS DMA for global and scratch
@@ -2759,29 +2759,52 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
                   !subst("$sccb", !if(has_sccb, "$sccb",""), ps.AsmOperands);
 }
 
+class FLAT_Real_vi_ex_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+  FLAT_Real_vi <op, ps, has_sccb> {
+  let AssemblerPredicate = isNVNotAllowedInFlat;
+}
+
+class FLAT_Real_gfx9 <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> :
+  FLAT_Real_vi <op, ps, has_sccb> {
+  let AssemblerPredicate = isNVAllowedInFlat;
+  let Subtarget = SIEncodingFamily.GFX9;
+  let DecoderNamespace = "GFX9";
+  let Inst{55} = cpol{CPolBit.NV}; // nv - GFX9 (pre-90A) uses bit 55 as the non-volatile bit.
+}
+
+multiclass FLAT_Real_mc_vi <bits<7> op, FLAT_Pseudo ps, bit has_sccb = ps.has_sccb> {
+  def _vi: FLAT_Real_vi_ex_gfx9<op, ps, has_sccb>;
+  def _gfx9: FLAT_Real_gfx9<op, ps, has_sccb>;
+}
+
 multiclass FLAT_Real_AllAddr_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
-  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
-  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+  defm _SADDR : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
+}
+
+multiclass FLAT_Real_AllAddr_vi_ex_gfx9<bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  def _vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
+  def _SADDR_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
 }
 
 class FLAT_Real_gfx940 <bits<7> op, FLAT_Pseudo ps> :
   FLAT_Real <op, ps>,
   SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX940> {
   let AssemblerPredicate = isGFX940Plus;
-  let DecoderNamespace = "GFX9";
+  let DecoderNamespace = "GFX940";
   let Inst{13} = ps.sve;
   let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue);
 }
 
 multiclass FLAT_Real_AllAddr_SVE_vi<bits<7> op> {
-  def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)> {
-    let AssemblerPredicate = isGFX8GFX9NotGFX940;
-    let OtherPredicates = [isGFX8GFX9NotGFX940];
-  }
-  def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")> {
-    let DecoderNamespace = "GFX9";
+  let OtherPredicates = [isGFX8GFX9NotGFX940] in {
+    defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME)>;
   }
+
+  defm _SADDR_vi : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>;
+
   let AssemblerPredicate = isGFX940Plus in {
     def _VE_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME)>;
     def _SVS_gfx940 : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_SVS")>;
@@ -2794,11 +2817,11 @@ multiclass FLAT_Real_AllAddr_LDS<bits<7> op, bits<7> pre_gfx940_op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
 
   let OtherPredicates = [isGFX8GFX9NotGFX940] in {
-    def _vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb> {
-      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds";
+    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME).AsmOperands # " lds" in {
+      defm "" : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME), has_sccb>;
     }
-    def _SADDR_vi : FLAT_Real_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb> {
-      let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds";
+    let AsmString = pre_gfx940_name # !cast<FLAT_Pseudo>(NAME#"_SADDR").AsmOperands # " lds" in {
+      defm _SADDR : FLAT_Real_mc_vi<pre_gfx940_op, !cast<FLAT_Pseudo>(NAME#"_SADDR"), has_sccb>;
     }
   }
 
@@ -2814,47 +2837,66 @@ multiclass FLAT_Real_AllAddr_SVE_LDS<bits<7> op, bits<7> pre_gfx940_op> {
   def _ST_gfx940  : FLAT_Real_gfx940<op, !cast<FLAT_Pseudo>(NAME#"_ST")>;
 }
 
-def FLAT_LOAD_UBYTE_vi         : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>;
-def FLAT_LOAD_SBYTE_vi         : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>;
-def FLAT_LOAD_USHORT_vi        : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>;
-def FLAT_LOAD_SSHORT_vi        : FLAT_Real_vi <0x13, FLAT_LOAD_SSHORT>;
-def FLAT_LOAD_DWORD_vi         : FLAT_Real_vi <0x14, FLAT_LOAD_DWORD>;
-def FLAT_LOAD_DWORDX2_vi       : FLAT_Real_vi <0x15, FLAT_LOAD_DWORDX2>;
-def FLAT_LOAD_DWORDX4_vi       : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>;
-def FLAT_LOAD_DWORDX3_vi       : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>;
-
-def FLAT_STORE_BYTE_vi         : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>;
-def FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
-def FLAT_STORE_SHORT_vi        : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>;
-def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
-def FLAT_STORE_DWORD_vi        : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>;
-def FLAT_STORE_DWORDX2_vi      : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>;
-def FLAT_STORE_DWORDX4_vi      : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>;
-def FLAT_STORE_DWORDX3_vi      : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>;
-
-def FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>;
-def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
-def FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>;
-def FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
-def FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>;
-def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
+defm FLAT_LOAD_UBYTE_vi         : FLAT_Real_mc_vi <0x10, FLAT_LOAD_UBYTE>;
+defm FLAT_LOAD_SBYTE_vi         : FLAT_Real_mc_vi <0x11, FLAT_LOAD_SBYTE>;
+defm FLAT_LOAD_USHORT_vi        : FLAT_Real_mc_vi <0x12, FLAT_LOAD_USHORT>;
+defm FLAT_LOAD_SSHORT_vi        : FLAT_Real_mc_vi <0x13, FLAT_LOAD_SSHORT>;
+defm FLAT_LOAD_DWORD_vi         : FLAT_Real_mc_vi <0x14, FLAT_LOAD_DWORD>;
+defm FLAT_LOAD_DWORDX2_vi       : FLAT_Real_mc_vi <0x15, FLAT_LOAD_DWORDX2>;
+defm FLAT_LOAD_DWORDX4_vi       : FLAT_Real_mc_vi <0x17, FLAT_LOAD_DWORDX4>;
+defm FLAT_LOAD_DWORDX3_vi       : FLAT_Real_mc_vi <0x16, FLAT_LOAD_DWORDX3>;
+
+defm FLAT_STORE_BYTE_vi         : FLAT_Real_mc_vi <0x18, FLAT_STORE_BYTE>;
+defm FLAT_STORE_BYTE_D16_HI_vi  : FLAT_Real_mc_vi <0x19, FLAT_STORE_BYTE_D16_HI>;
+defm FLAT_STORE_SHORT_vi        : FLAT_Real_mc_vi <0x1a, FLAT_STORE_SHORT>;
+defm FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x1b, FLAT_STORE_SHORT_D16_HI>;
+defm FLAT_STORE_DWORD_vi        : FLAT_Real_mc_vi <0x1c, FLAT_STORE_DWORD>;
+defm FLAT_STORE_DWORDX2_vi      : FLAT_Real_mc_vi <0x1d, FLAT_STORE_DWORDX2>;
+defm FLAT_STORE_DWORDX4_vi      : FLAT_Real_mc_vi <0x1f, FLAT_STORE_DWORDX4>;
+defm FLAT_STORE_DWORDX3_vi      : FLAT_Real_mc_vi <0x1e, FLAT_STORE_DWORDX3>;
+
+defm FLAT_LOAD_UBYTE_D16_vi    : FLAT_Real_mc_vi <0x20, FLAT_LOAD_UBYTE_D16>;
+defm FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>;
+defm FLAT_LOAD_SBYTE_D16_vi    : FLAT_Real_mc_vi <0x22, FLAT_LOAD_SBYTE_D16>;
+defm FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_mc_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>;
+defm FLAT_LOAD_SHORT_D16_vi    : FLAT_Real_mc_vi <0x24, FLAT_LOAD_SHORT_D16>;
+defm FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_mc_vi <0x25, FLAT_LOAD_SHORT_D16_HI>;
 
 multiclass FLAT_Real_Atomics_vi <bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
   defvar ps = !cast<FLAT_Pseudo>(NAME);
-  def _vi     : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
-  def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
-  def _RTN_agpr_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+  defm "" : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+  defm _RTN : FLAT_Real_mc_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Real_Atomics_vi_ex_gfx9 <bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> {
+  defvar ps = !cast<FLAT_Pseudo>(NAME);
+  def _vi     : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr), has_sccb>;
+  def _RTN_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN"), has_sccb>;
+
+  def _RTN_agpr_vi : FLAT_Real_vi_ex_gfx9<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN_agpr"), has_sccb>;
 }
 
 multiclass FLAT_Global_Real_Atomics_vi<bits<7> op,
   bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
   FLAT_Real_AllAddr_vi<op, has_sccb> {
-  def _RTN_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
-  def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+  defm _RTN  : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+  defm _SADDR_RTN : FLAT_Real_mc_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
+
+  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+}
+
+multiclass FLAT_Global_Real_Atomics_vi_ex_gfx9<bits<7> op,
+  bit has_sccb = !cast<FLAT_Pseudo>(NAME).has_sccb> :
+  FLAT_Real_AllAddr_vi_ex_gfx9<op, has_sccb> {
+  def _RTN_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN"), has_sccb>;
+  def _SADDR_RTN_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN"), has_sccb>;
 
-  def _RTN_agpr_vi  : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
-  def _SADDR_RTN_agpr_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
+  def _RTN_agpr_vi  : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_RTN_agpr"), has_sccb>;
+  def _SADDR_RTN_agpr_vi : FLAT_Real_vi_ex_gfx9 <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN_agpr"), has_sccb>;
 }
 
 defm FLAT_ATOMIC_SWAP       : FLAT_Real_Atomics_vi <0x40>;
@@ -3016,10 +3058,10 @@ let AssemblerPredicate = isGFX940Plus in {
   defm GLOBAL_ATOMIC_ADD_F64     : FLAT_Global_Real_Atomics_gfx940<0x4f>;
   defm GLOBAL_ATOMIC_MIN_F64     : FLAT_Global_Real_Atomics_gfx940<0x50>;
   defm GLOBAL_ATOMIC_MAX_F64     : FLAT_Global_Real_Atomics_gfx940<0x51>;
-  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi<0x4d>;
-  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi<0x4e>;
-  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi<0x52>;
-  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>;
+  defm FLAT_ATOMIC_ADD_F32       : FLAT_Real_Atomics_vi_ex_gfx9<0x4d>;
+  defm FLAT_ATOMIC_PK_ADD_F16    : FLAT_Real_Atomics_vi_ex_gfx9<0x4e>;
+  defm FLAT_ATOMIC_PK_ADD_BF16   : FLAT_Real_Atomics_vi_ex_gfx9<0x52>;
+  defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi_ex_gfx9<0x52>;
 } // End AssemblerPredicate = isGFX940Plus
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 703ec0a4befa5..3e6f35dbf5e54 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -186,8 +186,12 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
     O << " dlc";
   if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
     O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
-  if (Imm & ~CPol::ALL_pregfx12)
-    O << " /* unexpected cache policy bit */";
+  if (Imm & ~CPol::ALL_pregfx12) {
+    if ((Imm & CPol::NV) && AMDGPU::isGFX9(STI) && !AMDGPU::isGFX90A(STI))
+      O << " nv";
+    else
+      O << " /* unexpected cache policy bit */";
+  }
 }
 
 void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index eb4c8846441a2..677203d1c016b 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -285,6 +285,13 @@ void DXContainerGlobals::addPipelineStateValidationInfo(
     PSV.BaseData.NumThreadsX = MMI.EntryPropertyVec[0].NumThreadsX;
     PSV.BaseData.NumThreadsY = MMI.EntryPropertyVec[0].NumThreadsY;
     PSV.BaseData.NumThreadsZ = MMI.EntryPropertyVec[0].NumThreadsZ;
+    if (MMI.EntryPropertyVec[0].WaveSizeMin) {
+      PSV.BaseData.MinimumWaveLaneCount = MMI.EntryPropertyVec[0].WaveSizeMin;
+      PSV.BaseData.MaximumWaveLaneCount =
+          MMI.EntryPropertyVec[0].WaveSizeMax
+              ? MMI.EntryPropertyVec[0].WaveSizeMax
+              : MMI.EntryPropertyVec[0].WaveSizeMin;
+    }
     break;
   default:
     break;
diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
index cf8b833b3e42e..e1a472fe57642 100644
--- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
+++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp
@@ -82,6 +82,7 @@ enum class EntryPropsTag {
   ASStateTag,
   WaveSize,
   EntryRootSig,
+  WaveRange = 23,
 };
 
 } // namespace
@@ -177,14 +178,15 @@ getTagValueAsMetadata(EntryPropsTag Tag, uint64_t Value, LLVMContext &Ctx) {
   case EntryPropsTag::ASStateTag:
   case EntryPropsTag::WaveSize:
   case EntryPropsTag::EntryRootSig:
+  case EntryPropsTag::WaveRange:
     llvm_unreachable("NYI: Unhandled entry property tag");
   }
   return MDVals;
 }
 
-static MDTuple *
-getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
-                       const Triple::EnvironmentType ShaderProfile) {
+static MDTuple *getEntryPropAsMetadata(Module &M, const EntryProperties &EP,
+                                       uint64_t EntryShaderFlags,
+                                       const ModuleMetadataInfo &MMDI) {
   SmallVector<Metadata *> MDVals;
   LLVMContext &Ctx = EP.Entry->getContext();
   if (EntryShaderFlags != 0)
@@ -195,12 +197,13 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
     // FIXME: support more props.
     // See https://github.com/llvm/llvm-project/issues/57948.
     // Add shader kind for lib entries.
-    if (ShaderProfile == Triple::EnvironmentType::Library &&
+    if (MMDI.ShaderProfile == Triple::EnvironmentType::Library &&
         EP.ShaderStage != Triple::EnvironmentType::Library)
       MDVals.append(getTagValueAsMetadata(EntryPropsTag::ShaderKind,
                                           getShaderStage(EP.ShaderStage), Ctx));
 
     if (EP.ShaderStage == Triple::EnvironmentType::Compute) {
+      // Handle mandatory "hlsl.numthreads"
       MDVals.emplace_back(ConstantAsMetadata::get(ConstantInt::get(
           Type::getInt32Ty(Ctx), static_cast<int>(EntryPropsTag::NumThreads))));
       Metadata *NumThreadVals[] = {ConstantAsMetadata::get(ConstantInt::get(
@@ -210,8 +213,48 @@ getEntryPropAsMetadata(const EntryProperties &EP, uint64_t EntryShaderFlags,
                                    ConstantAsMetadata::get(ConstantInt::get(
                                        Type::getInt32Ty(Ctx), EP.NumThreadsZ))};
       MDVals.emplace_back(MDNode::get(Ctx, NumThreadVals));
+
+      // Handle optional "hlsl.wavesize". The fields are optionally represented
+      // if they are non-zero.
+      if (EP.WaveSizeMin != 0) {
+        bool IsWaveRange = VersionTuple(6, 8) <= MMDI.ShaderModelVersion;
+        bool IsWaveSize =
+            !IsWaveRange && VersionTuple(6, 6) <= MMDI.ShaderModelVersion;
+
+        if (!IsWaveRange && !IsWaveSize) {
+          reportError(M, "Shader model 6.6 or greater is required to specify "
+                         "the \"hlsl.wavesize\" function attribute");
+          return nullptr;
+        }
+
+        // A range is being specified if EP.WaveSizeMax != 0
+        if (EP.WaveSizeMax && !IsWaveRange) {
+          reportError(
+              M, "Shader model 6.8 or greater is required to specify "
+                 "wave size range values of the \"hlsl.wavesize\" function "
+                 "attribute");
+          return nullptr;
+        }
+
+        EntryPropsTag Tag =
+            IsWaveSize ? EntryPropsTag::WaveSize : EntryPropsTag::WaveRange;
+        MDVals.emplace_back(ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), static_cast<int>(Tag))));
+
+        SmallVector<Metadata *> WaveSizeVals = {ConstantAsMetadata::get(
+            ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMin))};
+        if (IsWaveRange) {
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizeMax)));
+          WaveSizeVals.push_back(ConstantAsMetadata::get(
+              ConstantInt::get(Type::getInt32Ty(Ctx), EP.WaveSizePref)));
+        }
+
+        MDVals.emplace_back(MDNode::get(Ctx, WaveSizeVals));
+      }
     }
   }
+
   if (MDVals.empty())
     return nullptr;
   return MDNode::get(Ctx, MDVals);
@@ -236,12 +279,11 @@ static MDTuple *constructEntryMetadata(const Function *EntryFn,
   return MDNode::get(Ctx, MDVals);
 }
 
-static MDTuple *emitEntryMD(const EntryProperties &EP, MDTuple *Signatures,
-                            MDNode *MDResources,
+static MDTuple *emitEntryMD(Module &M, const EntryProperties &EP,
+                            MDTuple *Signatures, MDNode *MDResources,
                             const uint64_t EntryShaderFlags,
-                            const Triple::EnvironmentType ShaderProfile) {
-  MDTuple *Properties =
-      getEntryPropAsMetadata(EP, EntryShaderFlags, ShaderProfile);
+                            const ModuleMetadataInfo &MMDI) {
+  MDTuple *Properties = getEntryPropAsMetadata(M, EP, EntryShaderFlags, MMDI);
   return constructEntryMetadata(EP.Entry, Signatures, MDResources, Properties,
                                 EP.Entry->getContext());
 }
@@ -523,10 +565,8 @@ static void translateGlobalMetadata(Module &M, DXILResourceMap &DRM,
                    Twine(Triple::getEnvironmentTypeName(MMDI.ShaderProfile) +
                          "'"));
     }
-
-    EntryFnMDNodes.emplace_back(emitEntryMD(EntryProp, Signatures, ResourceMD,
-                                            EntryShaderFlags,
-                                            MMDI.ShaderProfile));
+    EntryFnMDNodes.emplace_back(emitEntryMD(
+        M, EntryProp, Signatures, ResourceMD, EntryShaderFlags, MMDI));
   }
 
   NamedMDNode *EntryPointsNamedMD =
diff --git a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
index 479ac90b7d526..f29a739cb5c07 100644
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@@ -104,13 +104,6 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
     {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
 } // namespace
 
-namespace llvm {
-
-FunctionPass *createHexagonQFPOptimizer();
-void initializeHexagonQFPOptimizerPass(PassRegistry &);
-
-} // namespace llvm
-
 namespace {
 
 struct HexagonQFPOptimizer : public MachineFunctionPass {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e18f463..dfbbba0116f25 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -420,6 +420,9 @@ let Predicates = [HasVSX, IsISAFuture] in {
       : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB),
                       "vucmprlh $VRT, $VRA, $VRB", []>;
 
+  def XVRLW: XX3Form_XTAB6<60, 184, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
+                              "xvrlw $XT, $XA, $XB", []>;
+
   // AES Acceleration Instructions
   def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp),
                                     (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 45b0e7dc12263..f3c236ca8c9ce 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -532,13 +532,19 @@ struct StaticLibcallNameMap {
     // FIXME: This is broken if there are ever different triples compiled with
     // different libcalls.
     RTLIB::RuntimeLibcallsInfo RTCI(TT);
-    for (RTLIB::Libcall LC : RTLIB::libcalls()) {
-      StringRef NameLibcall = RTCI.getLibcallName(LC);
-      if (!NameLibcall.empty() &&
-          getRuntimeLibcallSignatures().Table[LC] != unsupported) {
-        assert(!Map.contains(NameLibcall) &&
-               "duplicate libcall names in name map");
-        Map[NameLibcall] = LC;
+
+    ArrayRef<RuntimeLibcallSignature> Table =
+        getRuntimeLibcallSignatures().Table;
+    for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+      if (!RTCI.isAvailable(Impl))
+        continue;
+      RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl);
+      if (Table[LC] != unsupported) {
+        StringRef NameLibcall =
+            RTLIB::RuntimeLibcallsInfo::getLibcallImplName(Impl);
+        // FIXME: Map should be to LibcallImpl
+        if (!Map.insert({NameLibcall, LC}).second)
+          llvm_unreachable("duplicate libcall names in name map");
       }
     }
   }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
index bd4d4ebd2a729..5977a276b1236 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp
@@ -320,7 +320,7 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo,
   case Xtensa::SSIP:
   case Xtensa::LSI:
   case Xtensa::LSIP:
-
+  case Xtensa::S32C1I:
     if (Res & 0x3) {
       report_fatal_error("Unexpected operand value!");
     }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 4e730707dcb78..8d0fd078b2696 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -202,7 +202,7 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
     return FeatureBits[Xtensa::FeatureWindowed];
   case Xtensa::ATOMCTL:
   case Xtensa::SCOMPARE1:
-    return FeatureBits[Xtensa::FeatureWindowed];
+    return FeatureBits[Xtensa::FeatureS32C1I];
   case Xtensa::NoRegister:
     return false;
   }
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
index b0f924f2cd58e..be69cefb5b78f 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp
@@ -114,14 +114,31 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest, bool RenamableSrc) const {
-  // The MOV instruction is not present in core ISA,
+  unsigned Opcode;
+
+  // The MOV instruction is not present in core ISA for AR registers,
   // so use OR instruction.
-  if (Xtensa::ARRegClass.contains(DestReg, SrcReg))
+  if (Xtensa::ARRegClass.contains(DestReg, SrcReg)) {
     BuildMI(MBB, MBBI, DL, get(Xtensa::OR), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+      Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::MOV_S;
+  else if (STI.hasSingleFloat() && Xtensa::FPRRegClass.contains(SrcReg) &&
+           Xtensa::ARRegClass.contains(DestReg))
+    Opcode = Xtensa::RFR;
+  else if (STI.hasSingleFloat() && Xtensa::ARRegClass.contains(SrcReg) &&
+           Xtensa::FPRRegClass.contains(DestReg))
+    Opcode = Xtensa::WFR;
   else
     report_fatal_error("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
 }
 
 void XtensaInstrInfo::storeRegToStackSlot(
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7a95df4b2a47c..b575d76e897d2 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1378,8 +1378,7 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
       IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
   // We can't know the precise weights here, as they would depend on the value
   // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
-  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(),
-                                              DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE);
   Type *IndexTy = DL.getIndexType(Call->getType());
   SmallVector<DominatorTree::UpdateType, 8> Updates;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index d85e4f7590197..9bdd8cb71f7f3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -479,7 +479,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                                      const Twine &NameStr = "",
                                      InsertPosition InsertBefore = nullptr) {
     auto *Sel = SelectInst::Create(C, S1, S2, NameStr, InsertBefore, nullptr);
-    setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, F, DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Sel, DEBUG_TYPE, &F);
     return Sel;
   }
 
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 0577ddbd2353c..0f3e66476f055 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -330,8 +330,7 @@ static void buildPartialUnswitchConditionalBranch(
       HasBranchWeights ? ComputeProfFrom.getMetadata(LLVMContext::MD_prof)
                        : nullptr);
   if (!HasBranchWeights)
-    setExplicitlyUnknownBranchWeightsIfProfiled(
-        *BR, *BR->getParent()->getParent(), DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
 }
 
 /// Copy a set of loop invariant values, and conditionally branch on them.
@@ -389,8 +388,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
       IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                        Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
   if (!ProfData)
-    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
-                                                DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, DEBUG_TYPE);
 }
 
 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -3204,8 +3202,7 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
   auto *InvariantBr =
       Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
   // We don't know anything about the relation between the limits.
-  setExplicitlyUnknownBranchWeightsIfProfiled(
-      *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE);
+  setExplicitlyUnknownBranchWeightsIfProfiled(*InvariantBr, DEBUG_TYPE);
 
   Builder.SetInsertPoint(CheckBlock);
   Builder.CreateCondBr(
diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
index 6d4436b92c119..dd8706cfb2855 100644
--- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
+++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp
@@ -54,8 +54,8 @@ PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M,
   const DataLayout &DL = M.getDataLayout();
   const Triple &TT = M.getTargetTriple();
 
-  for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) {
-    if (Impl == RTLIB::Unsupported)
+  for (RTLIB::LibcallImpl Impl : RTLIB::libcall_impls()) {
+    if (!RTLCI.isAvailable(Impl))
       continue;
 
     auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl);
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index ec2e6c1ab796b..9c8b6ef83e56d 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -109,8 +110,12 @@ void LoopVersioning::versionLoop(
   // Insert the conditional branch based on the result of the memchecks.
   Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
   Builder.SetInsertPoint(OrigTerm);
-  Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
-                       VersionedLoop->getLoopPreheader());
+  auto *BI =
+      Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+                           VersionedLoop->getLoopPreheader());
+  // We don't know what the probability of executing the versioned vs the
+  // unversioned variants is.
+  setExplicitlyUnknownBranchWeightsIfProfiled(*BI, DEBUG_TYPE);
   OrigTerm->eraseFromParent();
 
   // The loops merge in the original exit block.  This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 3a3e3ade20212..37c048f421f1a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5214,8 +5214,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
     // We don't have any info about this condition.
     auto *Br = TrueWhenEqual ? Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB)
                              : Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-    setExplicitlyUnknownBranchWeightsIfProfiled(*Br, *NewBB->getParent(),
-                                                DEBUG_TYPE);
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Br, DEBUG_TYPE);
 
     OldTI->eraseFromParent();
 
@@ -7732,19 +7731,24 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
       // label. The other is those powers of 2 that don't appear in the case
       // statement. We don't know the distribution of the values coming in, so
       // the safest is to split 50-50 the original probability to `default`.
-      uint64_t OrigDenominator = sum_of(map_range(
-          Weights, [](const auto &V) { return static_cast<uint64_t>(V); }));
+      uint64_t OrigDenominator =
+          sum_of(map_range(Weights, StaticCastTo<uint64_t>));
       SmallVector<uint64_t> NewWeights(2);
       NewWeights[1] = Weights[0] / 2;
       NewWeights[0] = OrigDenominator - NewWeights[1];
       setFittedBranchWeights(*BI, NewWeights, /*IsExpected=*/false);
-
-      // For the original switch, we reduce the weight of the default by the
-      // amount by which the previous branch contributes to getting to default,
-      // and then make sure the remaining weights have the same relative ratio
-      // wrt eachother.
+      // The probability of executing the default block stays constant. It was
+      //  p_d = Weights[0] / OrigDenominator,
+      //  which we abbreviate as W/D.
+      // We want to find the probability of the default branch of the switch
+      // statement. Let's call it X. We have W/D = W/(2D) + X * (1 - W/(2D)),
+      // i.e. the original probability is the probability that we go to the
+      // default block from the BI branch, or else take the default on the SI.
+      // Meaning X = W / (2D - W), or (W/2) / (D - W/2).
+      // This matches using W/2 for the default branch probability numerator and
+      // D - W/2 as the denominator.
+      Weights[0] = NewWeights[1];
       uint64_t CasesDenominator = OrigDenominator - Weights[0];
-      Weights[0] /= 2;
       for (auto &W : drop_begin(Weights))
         W = NewWeights[0] * static_cast<double>(W) / CasesDenominator;
 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 0cd885e599817..e85e808921c87 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,10 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
-; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1
+; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull_v8i8_v8i16:
@@ -1832,14 +1829,33 @@ entry:
 }
 
 define void @pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3) {
-; CHECK-LABEL: pmlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q2, [x1, #16]
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT:    str q0, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q2, [x1, #16]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT:    str q0, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q2, [x1, #16]
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
   %5 = getelementptr inbounds i32, ptr %3, i64 4
   %6 = load <8 x i16>, ptr %5, align 4
   %7 = trunc <8 x i16> %6 to <8 x i8>
@@ -1991,16 +2007,40 @@ define void @umlsl2_v4i32_uzp1(<8 x i16> %0, <4 x i32> %1, ptr %2, ptr %3) {
 }
 
 define void @pmlsl_pmlsl2_v8i16_uzp1(<16 x i8> %0, <8 x i16> %1, ptr %2, ptr %3, i32 %4) {
-; CHECK-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    pmull v3.8h, v0.8b, v2.8b
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.8h, v3.8h, v0.8h
-; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldp q2, q3, [x1]
+; CHECK-NEON-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    pmull v3.8h, v0.8b, v2.8b
+; CHECK-NEON-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    add v0.8h, v3.8h, v0.8h
+; CHECK-NEON-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-NEON-NEXT:    str q0, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldp q2, q3, [x1]
+; CHECK-SVE-NEXT:    uzp1 v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    pmull v3.8h, v0.8b, v2.8b
+; CHECK-SVE-NEXT:    pmull2 v0.8h, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    add v0.8h, v3.8h, v0.8h
+; CHECK-SVE-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-SVE-NEXT:    str q0, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmlsl_pmlsl2_v8i16_uzp1:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q2, q3, [x1]
+; CHECK-GI-NEXT:    mov d4, v0.d[1]
+; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
+; CHECK-GI-NEXT:    xtn v3.8b, v3.8h
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    pmull v2.8h, v4.8b, v3.8b
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    sub v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %5 = load <8 x i16>, ptr %3, align 4
   %6 = trunc <8 x i16> %5 to <8 x i8>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 2a8b3ce2ae10b..8cb319b2c3368 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,11 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for test_vmull_p8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_high_p8
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_p64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_vmull_high_p64
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon,+aes -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
@@ -2721,14 +2716,24 @@ entry:
 }
 
 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
-; CHECK-LABEL: test_vmull_p64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    mov x1, v0.d[1]
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vmull_p64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    mov x1, v0.d[1]
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vmull_p64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
   %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
@@ -2736,12 +2741,22 @@ entry:
 }
 
 define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
-; CHECK-LABEL: test_vmull_high_p64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT:    mov x1, v0.d[1]
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vmull_high_p64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT:    mov x1, v0.d[1]
+; CHECK-SD-NEXT:    fmov x0, d0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vmull_high_p64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d1, v1.d[1]
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = extractelement <2 x i64> %a, i32 1
   %1 = extractelement <2 x i64> %b, i32 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index e6df9f2fb2c56..90abc7d389c13 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -2,44 +2,35 @@
 ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64-none-elf -mattr=+aes -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for pmull8h
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for commutable_pmull8h
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmulh_1s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_commuted_neg_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_4s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_4s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_indexed_2d_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmulh_lane_1s
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlal_lane_1d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlsl_lane_1d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_dup_low
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_dup_high
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_duplane_low
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for pmull_from_extract_duplane_high
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlal_d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for sqdmlsl_d
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_high_64
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_commutable_pmull_64
+; CHECK-GI:	 warning: Instruction selection used fallback path for sqdmulh_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_commuted_neg_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_indexed_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_4s_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmla_indexed_scalar_2d_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmulh_lane_1s
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_lane_1d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for scalar_fmls_from_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v4f32_1
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fmls_with_fneg_before_extract_v2f64
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlal_d
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqdmlsl_d
 
 define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull8h:
@@ -2895,11 +2886,18 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) {
 }
 
 define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) {
-; CHECK-LABEL: pmull_from_extract_dup_high:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.16b, w0
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: pmull_from_extract_dup_high:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.16b, w0
+; CHECK-SD-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_dup_high:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    dup v1.8b, w0
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
   %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 
@@ -2924,12 +2922,20 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs)
 }
 
 define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK-LABEL: pmull_from_extract_duplane_high:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    dup v1.16b, v1.b[0]
-; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: pmull_from_extract_duplane_high:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    dup v1.16b, v1.b[0]
+; CHECK-SD-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: pmull_from_extract_duplane_high:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    dup v1.8b, v1.b[0]
+; CHECK-GI-NEXT:    pmull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ret
   %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 
@@ -3245,21 +3251,35 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 }
 
 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_pmull_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_pmull_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    ret
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   ret <16 x i8> %val
 }
 
 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
-; CHECK-LABEL: test_pmull_high_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_pmull_high_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_high_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d1, v1.d[1]
+; CHECK-GI-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    ret
   %l_hi = extractelement <2 x i64> %l, i32 1
   %r_hi = extractelement <2 x i64> %r, i32 1
   %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
@@ -3267,13 +3287,22 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
 }
 
 define <16 x i8> @test_commutable_pmull_64(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: test_commutable_pmull_64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
-; CHECK-NEXT:    add v0.16b, v0.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_commutable_pmull_64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x1
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-SD-NEXT:    add v0.16b, v0.16b, v0.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_commutable_pmull_64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    fmov d1, x1
+; CHECK-GI-NEXT:    pmull v2.1q, v0.1d, v1.1d
+; CHECK-GI-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-GI-NEXT:    add v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    ret
   %1 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   %2 = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %r, i64 %l)
   %3 = add <16 x i8> %1, %2
diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
index df4889b6f09de..bd6c168ce8776 100644
--- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll
+++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
@@ -1,10 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes CHECK,CHECK-LE
 ; RUN: llc -mtriple=aarch64_be-unknown-linux-gnu < %s | FileCheck %s --check-prefix CHECK-BE
-; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
-
-; CHECK-GI:       warning: Instruction selection used fallback path for test_pmull_high_p8_128
-; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_pmull_high_p8_64
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck %s --check-prefixes CHECK,CHECK-GI
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
@@ -521,12 +518,12 @@ entry:
 }
 
 define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
-; CHECK-LABEL: test_pmull_high_p8_128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x3
-; CHECK-NEXT:    fmov d1, x1
-; CHECK-NEXT:    pmull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-LE-LABEL: test_pmull_high_p8_128:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    fmov d0, x3
+; CHECK-LE-NEXT:    fmov d1, x1
+; CHECK-LE-NEXT:    pmull v0.8h, v1.8b, v0.8b
+; CHECK-LE-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_pmull_high_p8_128:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -538,6 +535,15 @@ define <8 x i16> @test_pmull_high_p8_128(i128 %aa, i128 %bb) {
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_pmull_high_p8_128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[0], x2
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    mov v1.d[1], x3
+; CHECK-GI-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %a = bitcast i128 %aa to <16 x i8>
   %b = bitcast i128 %bb to <16 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9cf85bd6..b2635d3d9f1a5 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,111 @@ entry:
   ret void
 }
 
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+                                     <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       .Lfunc_begin17:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:  zip1	[[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT:  zip2	[[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT:  zip1	[[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT:  zip2	[[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT:  zip1	[[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT:  zip2	[[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT:  zip1	[[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT:  zip2	[[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT:  st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT:  st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT:  ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0,  <4 x i32> %a1,  <4 x i32> %a2,  <4 x i32> %a3,
+                                      <4 x i32> %a4,  <4 x i32> %a5,  <4 x i32> %a6,  <4 x i32> %a7,
+                                      <4 x i32> %a8,  <4 x i32> %a9,  <4 x i32> %a10, <4 x i32> %a11,
+                                      <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK:       .Lfunc_begin18:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:      	zip1	[[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT:  	zip1	[[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT:  	zip1	[[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT:  	zip1	[[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT:  	zip1	[[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT:  	zip2	[[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT:  	zip2	[[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT:  	zip1	[[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT:  	zip1	[[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT:  	zip1	[[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT:  	zip2	[[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT:  	zip2	[[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT:  	zip2	[[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT:  	zip1	[[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip2	[[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT:  	zip2	[[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip1	[[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip2	[[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT:  	zip2	[[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT:  	zip1	[[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip2	[[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip1	[[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	zip1	[[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip2	[[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip1	[[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip2	[[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	zip2	[[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip1	[[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip2	[[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip1	[[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	zip2	[[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip2	[[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	st4	{ [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT:  	ldp	d9, d8, [sp, #48]               // 16-byte Folded Reload
+; CHECK-NEXT:  	ldp	d11, d10, [sp, #32]             // 16-byte Folded Reload
+; CHECK-NEXT:  	st4	{ [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT:  	add	x8, x0, #128
+; CHECK-NEXT:  	ldp	d13, d12, [sp, #16]             // 16-byte Folded Reload
+; CHECK-NEXT:  	st4	{ [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT:  	add	x8, x0, #192
+; CHECK-NEXT:  	st4	{ [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT:  	ldp	d15, d14, [sp], #64             // 16-byte Folded Reload
+; CHECK-NEXT:  	ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32>  <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  store <64 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
index 4fa7c29bfde02..71005224dd1e5 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll
@@ -481,3 +481,15 @@ define void @dominance_not_in_program_order(ptr addrspace(7) inreg %arg) {
   %lsr.iv11 = phi ptr addrspace(7) [ %arg, %.loopexit ], [ %arg, %.preheader15 ]
   br label %.loopexit
 }
+
+;; iree-org/iree#22551 - crash on input that reduces to the non-canonical select below.
+define ptr addrspace(7) @noncanonical_const_cond(ptr addrspace(7) %x) {
+; CHECK-LABEL: define { ptr addrspace(8), i32 } @noncanonical_const_cond
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0
+; CHECK-NEXT:    [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1
+; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[RET]]
+;
+  %ret = select i1 false, ptr addrspace(7) %x, ptr addrspace(7) %x
+  ret ptr addrspace(7) %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll
index aab035f811434..b9bf13886d366 100644
--- a/llvm/test/CodeGen/AMDGPU/packetizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll
@@ -1,13 +1,49 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s
-
-; CHECK: {{^}}test:
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
-; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
-; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -check-prefix=R600
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s -check-prefix=CM
 
 define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+; R600-LABEL: test:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     ADD_INT T0.Y, KC0[3].X, 1,
+; R600-NEXT:     ADD_INT T0.Z, KC0[3].Y, 1,
+; R600-NEXT:     ADD_INT T0.W, KC0[2].Z, 1,
+; R600-NEXT:     ADD_INT * T1.W, KC0[2].W, 1,
+; R600-NEXT:     BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
+; R600-NEXT:     BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
+; R600-NEXT:     OR_INT T0.W, PV.W, PV.Z,
+; R600-NEXT:     OR_INT * T1.W, PV.Y, PV.X,
+; R600-NEXT:     OR_INT T0.X, PS, PV.W,
+; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: test:
+; CM:       ; %bb.0: ; %entry
+; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT:    CF_END
+; CM-NEXT:    PAD
+; CM-NEXT:    ALU clause starting at 4:
+; CM-NEXT:     ADD_INT T0.X, KC0[3].X, 1,
+; CM-NEXT:     ADD_INT T0.Y, KC0[3].Y, 1,
+; CM-NEXT:     ADD_INT T0.Z, KC0[2].Z, 1,
+; CM-NEXT:     ADD_INT * T0.W, KC0[2].W, 1,
+; CM-NEXT:     BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z,
+; CM-NEXT:     BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z,
+; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
+; CM-NEXT:     OR_INT * T0.W, PV.Y, PV.X,
+; CM-NEXT:     OR_INT * T0.X, PV.W, PV.Z,
+; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
   %shl = sub i32 32, %e
   %x = add i32 %x_arg, 1
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
index cabd43edff9d6..9e243aec1128d 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
 }
 
 ; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
 entry:
   %sink = alloca i32, align 4
@@ -33,7 +32,7 @@ entry:
 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
 ; CHECK-NEXT: .byte   1
 ;; Function Entry PC
-; CHECK-NEXT: .long   [[LABEL_FUNC]]
+; CHECK-NEXT: .long _ZL10myCallbacki
 ;; Function type ID -5212364466660467813
 ; CHECK-NEXT: .long	1154849691
 ; CHECK-NEXT: .long	3081369122
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
index 3d3974ee6ba3b..8e8881ee722fb 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
 declare !type !2 ptr @direct_baz(ptr)
 
 ; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define ptr @ball() {
 entry:
   call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
 ;; Flags
 ; CHECK-NEXT: .byte   7
 ;; Function Entry PC
-; CHECK-NEXT: .long   [[LABEL_FUNC]]
+; CHECK-NEXT: .long ball
 ;; Function type ID -- set to 0 as no type metadata attached to function.
 ; CHECK-NEXT: .long   0
 ; CHECK-NEXT: .long   0
diff --git a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
index 80360041c106a..35e570bdde405 100644
--- a/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
+++ b/llvm/test/CodeGen/ARM/call-graph-section-tailcall.ll
@@ -29,6 +29,6 @@ declare !type !2 i32 @bar(i8 signext)
 
 ; CHECK:      Hex dump of section '.llvm.callgraph':
 ; CHECK-NEXT: 0x00000000 00050000 00008e19 0b7f3326 e3000154
-; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05100000 00a150b8
+; CHECK-NEXT: 0x00000010 86bc5981 4b8e3000 05000000 00a150b8
 ;; Verify that the type id 0x308e4b8159bc8654 is in section.
 ; CHECK-NEXT: 0x00000020 3e0cfe3c b2015486 bc59814b 8e30
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
new file mode 100644
index 0000000000000..9016c5d7e8d44
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-errs.ll
@@ -0,0 +1,31 @@
+; RUN: split-file %s %t
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm.ll 2>&1 | FileCheck %t/low-sm.ll
+; RUN: not opt -S --dxil-translate-metadata %t/low-sm-for-range.ll 2>&1 | FileCheck %t/low-sm-for-range.ll
+
+; Test that the "hlsl.wavesize" attribute is diagnosed on shader models too old to support it (6.6 for a wave size, 6.8 for a wave size range).
+
+;--- low-sm.ll
+
+; CHECK: Shader model 6.6 or greater is required to specify the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.5-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- low-sm-for-range.ll
+
+; CHECK: Shader model 6.8 or greater is required to specify wave size range values of the "hlsl.wavesize" function attribute
+
+target triple = "dxil-unknown-shadermodel6.7-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
new file mode 100644
index 0000000000000..3ad6c1d034252
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/wavesize-md-valid.ll
@@ -0,0 +1,96 @@
+; RUN: split-file %s %t
+; RUN: opt -S --dxil-translate-metadata %t/only.ll | FileCheck %t/only.ll
+; RUN: opt -S --dxil-translate-metadata %t/min.ll | FileCheck %t/min.ll
+; RUN: opt -S --dxil-translate-metadata %t/max.ll | FileCheck %t/max.ll
+; RUN: opt -S --dxil-translate-metadata %t/pref.ll | FileCheck %t/pref.ll
+
+; RUN: llc --filetype=obj %t/only.ll -o - | obj2yaml | FileCheck %t/only.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/min.ll -o - | obj2yaml | FileCheck %t/min.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/max.ll -o - | obj2yaml | FileCheck %t/max.ll --check-prefix=OBJ
+; RUN: llc --filetype=obj %t/pref.ll -o - | obj2yaml | FileCheck %t/pref.ll --check-prefix=OBJ
+
+; Test that wave size/range metadata is emitted with the correct tag (11 for shader model 6.6, 23 for 6.8) and that the PSV0 wave lane counts match.
+
+;--- only.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 11, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.6-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- min.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 0, i32 0}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 16
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,0,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- max.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 32, i32 0}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 32
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,32,0" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+;--- pref.ll
+
+; CHECK: !dx.entryPoints = !{![[#ENTRY:]]}
+; CHECK: ![[#ENTRY]] = !{ptr @main, !"main", null, null, ![[#PROPS:]]}
+; CHECK: ![[#PROPS]] = !{{{.*}}i32 23, ![[#WAVE_SIZE:]]{{.*}}}
+; CHECK: ![[#WAVE_SIZE]] = !{i32 16, i32 64, i32 32}
+
+; OBJ: - Name:    PSV0
+; OBJ:   PSVInfo:
+; OBJ:     MinimumWaveLaneCount: 16
+; OBJ:     MaximumWaveLaneCount: 64
+
+target triple = "dxil-unknown-shadermodel6.8-compute"
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.wavesize"="16,64,32" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/PowerPC/vec_rounding.ll b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
index 2f16a435440ff..438c8ebdc099e 100644
--- a/llvm/test/CodeGen/PowerPC/vec_rounding.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_rounding.ll
@@ -1,172 +1,251 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
 
 ; Check vector round to single-precision toward -infinity (vrfim)
 ; instruction generation using Altivec.
 
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
 declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
 define <2 x double> @floor_v2f64(<2 x double> %p)
+; CHECK-LABEL: floor_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frim 1, 1
+; CHECK-NEXT:    frim 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: floor_v2f64:
-; CHECK: frim
-; CHECK: frim
 
 declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
 define <4 x double> @floor_v4f64(<4 x double> %p)
+; CHECK-LABEL: floor_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frim 1, 1
+; CHECK-NEXT:    frim 2, 2
+; CHECK-NEXT:    frim 3, 3
+; CHECK-NEXT:    frim 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: floor_v4f64:
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
-; CHECK: frim
 
 declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
 define <2 x double> @ceil_v2f64(<2 x double> %p)
+; CHECK-LABEL: ceil_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frip 1, 1
+; CHECK-NEXT:    frip 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: ceil_v2f64:
-; CHECK: frip
-; CHECK: frip
 
 declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
 define <4 x double> @ceil_v4f64(<4 x double> %p)
+; CHECK-LABEL: ceil_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    frip 1, 1
+; CHECK-NEXT:    frip 2, 2
+; CHECK-NEXT:    frip 3, 3
+; CHECK-NEXT:    frip 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: ceil_v4f64:
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
-; CHECK: frip
 
 declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
 define <2 x double> @trunc_v2f64(<2 x double> %p)
+; CHECK-LABEL: trunc_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    friz 1, 1
+; CHECK-NEXT:    friz 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: trunc_v2f64:
-; CHECK: friz
-; CHECK: friz
 
 declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
 define <4 x double> @trunc_v4f64(<4 x double> %p)
+; CHECK-LABEL: trunc_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    friz 1, 1
+; CHECK-NEXT:    friz 2, 2
+; CHECK-NEXT:    friz 3, 3
+; CHECK-NEXT:    friz 4, 4
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: trunc_v4f64:
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
-; CHECK: friz
 
 declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
-define <2 x double> @nearbyint_v2f64(<2 x double> %p)
+define <2 x double> @nearbyint_v2f64(<2 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -128(1)
+; CHECK-NEXT:    std 0, 144(1)
+; CHECK-NEXT:    stfd 30, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 31, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 31, 2
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 30, 1
+; CHECK-NEXT:    fmr 1, 31
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 2, 1
+; CHECK-NEXT:    fmr 1, 30
+; CHECK-NEXT:    lfd 31, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 30, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 128
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
 {
   %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
   ret <2 x double> %t
 }
-; CHECK-LABEL: nearbyint_v2f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
 
 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
-define <4 x double> @nearbyint_v4f64(<4 x double> %p)
+define <4 x double> @nearbyint_v4f64(<4 x double> %p) nounwind
+; CHECK-LABEL: nearbyint_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -144(1)
+; CHECK-NEXT:    std 0, 160(1)
+; CHECK-NEXT:    stfd 28, 112(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stfd 29, 120(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 29, 2
+; CHECK-NEXT:    stfd 30, 128(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 30, 3
+; CHECK-NEXT:    stfd 31, 136(1) # 8-byte Folded Spill
+; CHECK-NEXT:    fmr 31, 4
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 28, 1
+; CHECK-NEXT:    fmr 1, 29
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 29, 1
+; CHECK-NEXT:    fmr 1, 30
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 30, 1
+; CHECK-NEXT:    fmr 1, 31
+; CHECK-NEXT:    bl nearbyint
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    fmr 4, 1
+; CHECK-NEXT:    fmr 1, 28
+; CHECK-NEXT:    lfd 31, 136(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 28, 112(1) # 8-byte Folded Reload
+; CHECK-NEXT:    fmr 2, 29
+; CHECK-NEXT:    fmr 3, 30
+; CHECK-NEXT:    lfd 30, 128(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lfd 29, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 144
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
 {
   %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
   ret <4 x double> %t
 }
-; CHECK-LABEL: nearbyint_v4f64:
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
-; CHECK: bl nearbyint
 
 
 declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
 define <4 x float> @floor_v4f32(<4 x float> %p)
+; CHECK-LABEL: floor_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfim 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: floor_v4f32:
-; CHECK: vrfim
 
 declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
 define <8 x float> @floor_v8f32(<8 x float> %p)
+; CHECK-LABEL: floor_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfim 2, 2
+; CHECK-NEXT:    vrfim 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: floor_v8f32:
-; CHECK: vrfim
-; CHECK: vrfim
 
 declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
 define <4 x float> @ceil_v4f32(<4 x float> %p)
+; CHECK-LABEL: ceil_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfip 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: ceil_v4f32:
-; CHECK: vrfip
 
 declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
 define <8 x float> @ceil_v8f32(<8 x float> %p)
+; CHECK-LABEL: ceil_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfip 2, 2
+; CHECK-NEXT:    vrfip 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: ceil_v8f32:
-; CHECK: vrfip
-; CHECK: vrfip
 
 declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
 define <4 x float> @trunc_v4f32(<4 x float> %p)
+; CHECK-LABEL: trunc_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfiz 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: trunc_v4f32:
-; CHECK: vrfiz
 
 declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
 define <8 x float> @trunc_v8f32(<8 x float> %p)
+; CHECK-LABEL: trunc_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfiz 2, 2
+; CHECK-NEXT:    vrfiz 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: trunc_v8f32:
-; CHECK: vrfiz
-; CHECK: vrfiz
 
 declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
 define <4 x float> @nearbyint_v4f32(<4 x float> %p)
+; CHECK-LABEL: nearbyint_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfin 2, 2
+; CHECK-NEXT:    blr
 {
   %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
   ret <4 x float> %t
 }
-; CHECK-LABEL: nearbyint_v4f32:
-; CHECK: vrfin
 
 declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
 define <8 x float> @nearbyint_v8f32(<8 x float> %p)
+; CHECK-LABEL: nearbyint_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vrfin 2, 2
+; CHECK-NEXT:    vrfin 3, 3
+; CHECK-NEXT:    blr
 {
   %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
-; CHECK-LABEL: nearbyint_v8f32:
-; CHECK: vrfin
-; CHECK: vrfin
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index f36baba402421..ab8498d8d3451 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
 }
 
 ; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
 entry:
   %sink = alloca i32, align 4
@@ -33,6 +32,6 @@ entry:
 ;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
 ; CHECK-NEXT: .byte   1
 ;; Function Entry PC
-; CHECK-NEXT: .quad   [[LABEL_FUNC]]
+; CHECK-NEXT: .quad   _ZL10myCallbacki
 ;; Function type ID
 ; CHECK-NEXT: .quad   -5212364466660467813
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index cdbad668aec54..02d71073b65c5 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
 declare !type !2 ptr @direct_baz(ptr)
 
 ; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
 define ptr @ball() {
 entry:
   call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
 ;; Flags
 ; CHECK-NEXT: .byte   7
 ;; Function Entry PC
-; CHECK-NEXT: .quad   [[LABEL_FUNC]]
+; CHECK-NEXT: .quad ball
 ;; Function type ID -- set to 0 as no type metadata attached to function.
 ; CHECK-NEXT: .quad   0
 ;; Number of unique direct callees.
diff --git a/llvm/test/CodeGen/Xtensa/s32c1i.ll b/llvm/test/CodeGen/Xtensa/s32c1i.ll
new file mode 100644
index 0000000000000..aad738abe6a4c
--- /dev/null
+++ b/llvm/test/CodeGen/Xtensa/s32c1i.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=xtensa -mattr=+s32c1i  -filetype=obj %s -o - | llvm-objdump --arch=xtensa --mattr=s32c1i -d - | FileCheck %s -check-prefix=XTENSA
+
+define i32 @constraint_i(i32 %a) {
+; XTENSA: 0: 22 e2 01    s32c1i  a2, a2, 4
+  %res = tail call i32 asm "s32c1i $0, $1, $2", "=r,r,i"(i32 %a, i32 4)
+  ret i32 %res
+}
diff --git a/llvm/test/Demangle/ms-operators.test b/llvm/test/Demangle/ms-operators.test
index b940488786631..cafa1ae3c0663 100644
--- a/llvm/test/Demangle/ms-operators.test
+++ b/llvm/test/Demangle/ms-operators.test
@@ -143,9 +143,24 @@
 ??_7A@B@@6BC@D@@@
 ; CHECK: const B::A::`vftable'{for `D::C'}
 
+??_7A@B@@6BC@D@@E@F@@@
+; CHECK: const B::A::`vftable'{for `D::C's `F::E'}
+
+??_7A@B@@6BC@D@@E@F@@G@H@@@
+; CHECK: const B::A::`vftable'{for `D::C's `F::E's `H::G'}
+
 ??_8Middle2@@7B@
 ; CHECK: const Middle2::`vbtable'
 
+??_7A@@6BB@@@
+; CHECK: const A::`vftable'{for `B'}
+
+??_7A@@6BB@@C@@@
+; CHECK: const A::`vftable'{for `B's `C'}
+
+??_7A@@6BB@@C@@D@@@
+; CHECK: const A::`vftable'{for `B's `C's `D'}
+
 ??_9Base@@$B7AA
 ; CHECK: [thunk]: __cdecl Base::`vcall'{8, {flat}}
 
diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s
index 6e84e9132a55d..567d41df6d9ef 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_err.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_err.s
@@ -1,5 +1,5 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A --implicit-check-not=error: %s
-
+// XFAIL: *
 ds_add_src2_u32 v1
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
@@ -239,3 +239,481 @@ scratch_load_lds_dword v2, off
 
 ds_read_b32 v0, v1 gds
 // GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: gds modifier is not supported on this GPU
+
+// op_sel not allowed in dot opcodes with 4- or 8-bit packed data
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4_u32_u8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot4c_i32_i8 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8_u32_u4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[0,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,0]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_dot8c_i32_i4 v0, v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+// nv bit in FLAT instructions
+flat_load_ubyte v5, v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_load_ubyte a5, v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], v5 offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], a5 offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_load_ubyte v5, v[2:3], off offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_store_byte v[2:3], v5, off offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_add v[2:3], v5, off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap a1, v[2:3], a2, off glc nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap_x2 v[2:3], v[4:5], off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap_x2 v[2:3], a[4:5], off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_ubyte v5, off, s2 offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_ubyte a5, off, s2 offset:-1 nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_store_dword v2, v3, off nv
+// GFX90A: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index c96a72ddc2573..3af0d83fb3056 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -706,107 +706,107 @@ flat_load_short_d16_hi a5, v[2:3] offset:4095 glc
 flat_load_short_d16_hi a5, v[2:3] offset:4095 slc
 
 // GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_swap a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_add a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_sub a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umin a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umax a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_and a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_or a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_xor a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_inc a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_dec a0, v[2:3], a2 offset:4095 glc
 
 // GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc
 
 // GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc
 
 // GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx942_err.s b/llvm/test/MC/AMDGPU/gfx942_err.s
index fd59a01b34a04..dc51bab65aa04 100644
--- a/llvm/test/MC/AMDGPU/gfx942_err.s
+++ b/llvm/test/MC/AMDGPU/gfx942_err.s
@@ -125,3 +125,31 @@ global_load_dword v[2:3], off lds
 
 scratch_load_dword v2, off lds
 // GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+// nv bit in FLAT instructions
+flat_load_ubyte v5, v[2:3] offset:4095 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_store_dword v[2:3], v5 offset:4095 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+flat_atomic_add_f32 v[2:3], v5 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_load_dword v2, v[2:3], off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_store_dword v[2:3], v5 off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_add_f64 v[0:1], v[2:3], off sc1 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+global_atomic_swap v0, v[2:3], v5 off sc0 nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_load_lds_dword v2, off nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
+
+scratch_store_dword v2, v3, off nv
+// GFX942: :[[@LINE-1]]:{{[0-9]+}}: error: nv is not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
index 5cc3d2533a149..7687c0a478bd9 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_flat.s
@@ -24,6 +24,18 @@ flat_load_ubyte v5, v[1:2] offset:4095 glc
 flat_load_ubyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 
@@ -48,6 +60,18 @@ flat_load_sbyte v5, v[1:2] offset:4095 glc
 flat_load_sbyte v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_ushort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 
@@ -72,6 +96,18 @@ flat_load_ushort v5, v[1:2] offset:4095 glc
 flat_load_ushort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ushort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ushort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sshort v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -96,6 +132,18 @@ flat_load_sshort v5, v[1:2] offset:4095 glc
 flat_load_sshort v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sshort v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sshort v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dword v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 
@@ -120,6 +168,18 @@ flat_load_dword v5, v[1:2] offset:4095 glc
 flat_load_dword v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dword v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dword v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 
@@ -144,6 +204,18 @@ flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc
 flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx2 v[5:6], v[1:2] nv
+// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 
@@ -168,6 +240,18 @@ flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc
 flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx3 v[5:7], v[1:2] nv
+// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -192,6 +276,18 @@ flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc
 flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_dwordx4 v[5:8], v[1:2] nv
+// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_store_byte v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 
@@ -216,6 +312,18 @@ flat_store_byte v[1:2], v2 offset:4095 glc
 flat_store_byte v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_byte v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_byte_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 
@@ -240,6 +348,18 @@ flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_byte_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_short v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 
@@ -264,6 +384,18 @@ flat_store_short v[1:2], v2 offset:4095 glc
 flat_store_short v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_short v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_short_d16_hi v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -288,6 +420,18 @@ flat_store_short_d16_hi v[1:2], v2 offset:4095 glc
 flat_store_short_d16_hi v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_short_d16_hi v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dword v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 
@@ -312,6 +456,18 @@ flat_store_dword v[1:2], v2 offset:4095 glc
 flat_store_dword v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dword v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dword v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 
@@ -336,6 +492,18 @@ flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc
 flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx2 v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095
 // CHECK: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 
@@ -360,6 +528,18 @@ flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc
 flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx3 v[1:2], v[2:4] nv
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv
+// CHECK: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
+
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095
 // CHECK: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 
@@ -384,6 +564,18 @@ flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc
 flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc
 // CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 
+flat_store_dwordx4 v[1:2], v[2:5] nv
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv
+// CHECK: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
+
+flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
+
 flat_load_ubyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 
@@ -408,6 +600,18 @@ flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 
@@ -432,6 +636,18 @@ flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_ubyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 
@@ -456,6 +672,18 @@ flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 
@@ -480,6 +708,18 @@ flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_sbyte_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_short_d16 v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 
@@ -504,6 +744,18 @@ flat_load_short_d16 v5, v[1:2] offset:4095 glc
 flat_load_short_d16 v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_short_d16 v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16 v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
+
 flat_load_short_d16_hi v5, v[1:2] offset:4095
 // CHECK: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 
@@ -528,6 +780,18 @@ flat_load_short_d16_hi v5, v[1:2] offset:4095 glc
 flat_load_short_d16_hi v5, v[1:2] offset:4095 slc
 // CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 
+flat_load_short_d16_hi v5, v[1:2] nv
+// CHECK: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:7 nv
+// CHECK: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
+
+flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
+
 flat_atomic_swap v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 
@@ -552,6 +816,18 @@ flat_atomic_swap v0, v[1:2], v2 offset:4095 glc
 flat_atomic_swap v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_swap v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_swap v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095
 // CHECK: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 
@@ -576,6 +852,18 @@ flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc
 flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc
 // CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_cmpswap v[1:2], v[2:3] nv
+// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv
+// CHECK: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_add v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 
@@ -600,6 +888,18 @@ flat_atomic_add v0, v[1:2], v2 offset:4095 glc
 flat_atomic_add v[1:2], v2 offset:4095 slc
 // CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 
+flat_atomic_add v[1:2], v2 nv
+// CHECK: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v[1:2], v2 offset:7 nv
+// CHECK: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv
+// CHECK: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
+
+flat_atomic_add v[1:2], v2 offset:4095 slc nv
+// CHECK: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
+
 flat_atomic_sub v[1:2], v2 offset:4095
 // CHECK: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 
@@ -1197,6 +1497,18 @@ global_load_ubyte v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1242,6 +1554,18 @@ global_load_sbyte v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_ushort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1287,6 +1611,18 @@ global_load_ushort v5, v1, s[4:5] offset:-1 glc
 global_load_ushort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ushort v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ushort v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sshort v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1332,6 +1668,18 @@ global_load_sshort v5, v1, s[4:5] offset:-1 glc
 global_load_sshort v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sshort v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sshort v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dword v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1377,6 +1725,18 @@ global_load_dword v5, v1, s[4:5] offset:-1 glc
 global_load_dword v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dword v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dword v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1422,6 +1782,18 @@ global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc
 global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx2 v[5:6], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1467,6 +1839,15 @@ global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc
 global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx3 v[5:7], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1512,6 +1893,15 @@ global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc
 global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_dwordx4 v[5:8], v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
+global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
+
 global_store_byte v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1557,6 +1947,18 @@ global_store_byte v1, v2, s[6:7] offset:-1 glc
 global_store_byte v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_byte v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1602,6 +2004,18 @@ global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_byte_d16_hi v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_short v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1647,6 +2061,18 @@ global_store_short v1, v2, s[6:7] offset:-1 glc
 global_store_short v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_short v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1692,6 +2118,18 @@ global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc
 global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_short_d16_hi v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dword v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1737,6 +2175,18 @@ global_store_dword v1, v2, s[6:7] offset:-1 glc
 global_store_dword v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dword v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dword v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1782,6 +2232,18 @@ global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc
 global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx2 v1, v[2:3], s[6:7] nv
+// CHECK: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1827,6 +2289,18 @@ global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc
 global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx3 v1, v[2:4], s[6:7] nv
+// CHECK: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
+
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x06,0x00]
 
@@ -1872,6 +2346,18 @@ global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc
 global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x06,0x00]
 
+global_store_dwordx4 v1, v[2:5], s[6:7] nv
+// CHECK: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
+
+global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
+
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1917,6 +2403,18 @@ global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x04,0x05]
 
@@ -1962,6 +2460,18 @@ global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_ubyte_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2007,6 +2517,18 @@ global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2052,6 +2574,18 @@ global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_sbyte_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_short_d16 v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2097,6 +2631,18 @@ global_load_short_d16 v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16 v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_short_d16 v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
+
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1
 // CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x04,0x05]
 
@@ -2142,6 +2688,18 @@ global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc
 global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc
 // CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x04,0x05]
 
+global_load_short_d16_hi v5, v1, s[4:5] nv
+// CHECK: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv
+// CHECK: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
+
+global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
+
 global_atomic_swap v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2187,6 +2745,18 @@ global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_swap v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_swap v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2232,6 +2802,18 @@ global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc
 global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_cmpswap v1, v[2:3], s[6:7] nv
+// CHECK: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_add v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x06,0x00]
 
@@ -2277,6 +2859,18 @@ global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc
 global_atomic_add v1, v2, s[6:7] offset:-1 slc
 // CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x06,0x00]
 
+global_atomic_add v1, v2, s[6:7] nv
+// CHECK: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v1, v2, s[6:7] offset:-1 nv
+// CHECK: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv
+// CHECK: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
+
+global_atomic_add v1, v2, s[6:7] offset:-1 slc nv
+// CHECK: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
+
 global_atomic_sub v1, v2, s[6:7] offset:-1
 // CHECK: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x06,0x00]
 
@@ -3357,6 +3951,18 @@ scratch_load_ubyte v5, off, s2 offset:-1 glc
 scratch_load_ubyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte v5, off, s2 nv
+// CHECK: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3402,6 +4008,18 @@ scratch_load_sbyte v5, off, s2 offset:-1 glc
 scratch_load_sbyte v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte v5, off, s2 nv
+// CHECK: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_ushort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3447,6 +4065,18 @@ scratch_load_ushort v5, off, s2 offset:-1 glc
 scratch_load_ushort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ushort v5, off, s2 nv
+// CHECK: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ushort v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sshort v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3492,6 +4122,18 @@ scratch_load_sshort v5, off, s2 offset:-1 glc
 scratch_load_sshort v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sshort v5, off, s2 nv
+// CHECK: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sshort v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dword v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3537,6 +4179,18 @@ scratch_load_dword v5, off, s2 offset:-1 glc
 scratch_load_dword v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dword v5, off, s2 nv
+// CHECK: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dword v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3582,6 +4236,18 @@ scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc
 scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx2 v[5:6], off, s2 nv
+// CHECK: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3627,6 +4293,18 @@ scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc
 scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx3 v[5:7], off, s2 nv
+// CHECK: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -3672,6 +4350,18 @@ scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc
 scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_dwordx4 v[5:8], off, s2 nv
+// CHECK: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_store_byte off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3717,6 +4407,18 @@ scratch_store_byte off, v2, s3 offset:-1 glc
 scratch_store_byte off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_byte off, v2, s3 nv
+// CHECK: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_byte_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3762,6 +4464,18 @@ scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_byte_d16_hi off, v2, s3 nv
+// CHECK: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_short off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3807,6 +4521,18 @@ scratch_store_short off, v2, s3 offset:-1 glc
 scratch_store_short off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_short off, v2, s3 nv
+// CHECK: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_short_d16_hi off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3852,6 +4578,18 @@ scratch_store_short_d16_hi off, v2, s3 offset:-1 glc
 scratch_store_short_d16_hi off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_short_d16_hi off, v2, s3 nv
+// CHECK: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dword off, v2, s3 offset:-1
 // CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3897,6 +4635,18 @@ scratch_store_dword off, v2, s3 offset:-1 glc
 scratch_store_dword off, v2, s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dword off, v2, s3 nv
+// CHECK: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dword off, v2, s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1
 // CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3942,6 +4692,18 @@ scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc
 scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx2 off, v[2:3], s3 nv
+// CHECK: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1
 // CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 
@@ -3987,6 +4749,18 @@ scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc
 scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx3 off, v[2:4], s3 nv
+// CHECK: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1
 // CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 
@@ -4032,6 +4806,18 @@ scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc
 scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc
 // CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 
+scratch_store_dwordx4 off, v[2:5], s3 nv
+// CHECK: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv
+// CHECK: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+
+scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+
 scratch_load_ubyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4077,6 +4863,18 @@ scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4122,6 +4920,18 @@ scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_ubyte_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4167,6 +4977,18 @@ scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4212,6 +5034,18 @@ scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_sbyte_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_short_d16 v5, off, s2 offset:-1
 // CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4254,6 +5088,18 @@ scratch_load_short_d16 v5, off, s2 offset:-4096
 scratch_load_short_d16 v5, off, s2 offset:-1 glc
 // CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_short_d16 v5, off, s2 nv
+// CHECK: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16 v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+
 scratch_load_short_d16 v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 
@@ -4302,6 +5148,18 @@ scratch_load_short_d16_hi v5, off, s2 offset:-1 glc
 scratch_load_short_d16_hi v5, off, s2 offset:-1 slc
 // CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 
+scratch_load_short_d16_hi v5, off, s2 nv
+// CHECK: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 nv
+// CHECK: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv
+// CHECK: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+
+scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv
+// CHECK: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+
 global_load_dword v[2:3], off lds
 // CHECK: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
index 0ee659e207c91..4c06585a4c2eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_flat.txt
@@ -21,6 +21,18 @@
 # CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x42,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x40,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x41,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x42,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x44,0xdc,0x01,0x00,0x00,0x05
 
@@ -42,6 +54,18 @@
 # CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x46,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte v5, v[1:2] nv           ; encoding: [0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x44,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x45,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x46,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x48,0xdc,0x01,0x00,0x00,0x05
 
@@ -63,6 +87,18 @@
 # CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ushort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x48,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x49,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ushort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4c,0xdc,0x01,0x00,0x00,0x05
 
@@ -84,6 +120,18 @@
 # CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x4e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sshort v5, v[1:2] nv          ; encoding: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x4c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sshort v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x4e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dword v5, v[1:2] offset:4095  ; encoding: [0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x50,0xdc,0x01,0x00,0x00,0x05
 
@@ -105,6 +153,18 @@
 # CHECK: flat_load_dword v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x52,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dword v5, v[1:2] nv           ; encoding: [0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:7 nv  ; encoding: [0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x50,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x51,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dword v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x52,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x54,0xdc,0x01,0x00,0x00,0x05
 
@@ -126,6 +186,18 @@
 # CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x56,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] nv     ; encoding: [0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x54,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x55,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx2 v[5:6], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x56,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x58,0xdc,0x01,0x00,0x00,0x05
 
@@ -147,6 +219,18 @@
 # CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] nv     ; encoding: [0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x58,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x59,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx3 v[5:7], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5c,0xdc,0x01,0x00,0x00,0x05
 
@@ -168,6 +252,18 @@
 # CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x5e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] nv     ; encoding: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x5c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_dwordx4 v[5:8], v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x5e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_store_byte v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x60,0xdc,0x01,0x02,0x00,0x00
 
@@ -189,6 +285,18 @@
 # CHECK: flat_store_byte v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x62,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_byte v[1:2], v2 nv           ; encoding: [0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x60,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x61,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x62,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x64,0xdc,0x01,0x02,0x00,0x00
 
@@ -210,6 +318,18 @@
 # CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x66,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 nv    ; encoding: [0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x64,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x65,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_byte_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x66,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_short v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x68,0xdc,0x01,0x02,0x00,0x00
 
@@ -231,6 +351,18 @@
 # CHECK: flat_store_short v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6a,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_short v[1:2], v2 nv          ; encoding: [0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x68,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x69,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6a,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6c,0xdc,0x01,0x02,0x00,0x00
 
@@ -252,6 +384,18 @@
 # CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x6e,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_short_d16_hi v[1:2], v2 nv   ; encoding: [0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x6c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6d,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_short_d16_hi v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x6e,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x70,0xdc,0x01,0x02,0x00,0x00
 
@@ -273,6 +417,18 @@
 # CHECK: flat_store_dword v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x72,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dword v[1:2], v2 nv          ; encoding: [0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x70,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x71,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dword v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x72,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x74,0xdc,0x01,0x02,0x00,0x00
 
@@ -294,6 +450,18 @@
 # CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x76,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] nv    ; encoding: [0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x74,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x75,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx2 v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x76,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x78,0xdc,0x01,0x02,0x00,0x00
 
@@ -315,6 +483,18 @@
 # CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7a,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] nv    ; encoding: [0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:7 nv ; encoding: [0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x78,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 glc nv ; encoding: [0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x79,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx3 v[1:2], v[2:4] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7a,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7c,0xdc,0x01,0x02,0x00,0x00
 
@@ -336,6 +516,18 @@
 # CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x7e,0xdc,0x01,0x02,0x00,0x00
 
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] nv    ; encoding: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+0x00,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:7 nv ; encoding: [0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00]
+0x07,0x00,0x7c,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 glc nv ; encoding: [0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7d,0xdc,0x01,0x02,0x80,0x00
+
+# CHECK: flat_store_dwordx4 v[1:2], v[2:5] offset:4095 slc nv ; encoding: [0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x7e,0xdc,0x01,0x02,0x80,0x00
+
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x80,0xdc,0x01,0x00,0x00,0x05
 
@@ -357,6 +549,18 @@
 # CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x82,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x80,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x81,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x82,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x84,0xdc,0x01,0x00,0x00,0x05
 
@@ -378,6 +582,18 @@
 # CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x86,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x84,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x85,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_ubyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x86,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x88,0xdc,0x01,0x00,0x00,0x05
 
@@ -399,6 +615,18 @@
 # CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8a,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x88,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x89,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8a,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8c,0xdc,0x01,0x00,0x00,0x05
 
@@ -420,6 +648,18 @@
 # CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x8e,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x8c,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8d,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_sbyte_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x8e,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x90,0xdc,0x01,0x00,0x00,0x05
 
@@ -441,6 +681,18 @@
 # CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x92,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_short_d16 v5, v[1:2] nv       ; encoding: [0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x90,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x91,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16 v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x92,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x94,0xdc,0x01,0x00,0x00,0x05
 
@@ -462,6 +714,18 @@
 # CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05]
 0xff,0x0f,0x96,0xdc,0x01,0x00,0x00,0x05
 
+# CHECK: flat_load_short_d16_hi v5, v[1:2] nv    ; encoding: [0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+0x00,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:7 nv ; encoding: [0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05]
+0x07,0x00,0x94,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 glc nv ; encoding: [0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x95,0xdc,0x01,0x00,0x80,0x05
+
+# CHECK: flat_load_short_d16_hi v5, v[1:2] offset:4095 slc nv ; encoding: [0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05]
+0xff,0x0f,0x96,0xdc,0x01,0x00,0x80,0x05
+
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x00,0xdd,0x01,0x02,0x00,0x00
 
@@ -483,6 +747,18 @@
 # CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x02,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_swap v[1:2], v2 nv          ; encoding: [0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v[1:2], v2 offset:7 nv ; encoding: [0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x00,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x01,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_swap v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x02,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x04,0xdd,0x01,0x02,0x00,0x00
 
@@ -504,6 +780,18 @@
 # CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x06,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] nv   ; encoding: [0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:7 nv ; encoding: [0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x04,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v0, v[1:2], v[2:3] offset:4095 glc nv ; encoding: [0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x05,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_cmpswap v[1:2], v[2:3] offset:4095 slc nv ; encoding: [0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x06,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x08,0xdd,0x01,0x02,0x00,0x00
 
@@ -525,6 +813,18 @@
 # CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0a,0xdd,0x01,0x02,0x00,0x00
 
+# CHECK: flat_atomic_add v[1:2], v2 nv           ; encoding: [0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+0x00,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v[1:2], v2 offset:7 nv  ; encoding: [0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00]
+0x07,0x00,0x08,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v0, v[1:2], v2 offset:4095 glc nv ; encoding: [0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x09,0xdd,0x01,0x02,0x80,0x00
+
+# CHECK: flat_atomic_add v[1:2], v2 offset:4095 slc nv ; encoding: [0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00]
+0xff,0x0f,0x0a,0xdd,0x01,0x02,0x80,0x00
+
 # CHECK: flat_atomic_sub v[1:2], v2 offset:4095  ; encoding: [0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00]
 0xff,0x0f,0x0c,0xdd,0x01,0x02,0x00,0x00
 
@@ -1017,6 +1317,18 @@
 # CHECK: global_load_ubyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x40,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x40,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x40,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x41,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x42,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x44,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1026,6 +1338,18 @@
 # CHECK: global_load_sbyte v5, v[1:2], off       ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x44,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x44,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x44,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x45,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x46,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_ushort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x48,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1035,6 +1359,18 @@
 # CHECK: global_load_ushort v5, v[1:2], off      ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x48,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ushort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x48,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x48,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x49,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ushort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sshort v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1044,6 +1380,18 @@
 # CHECK: global_load_sshort v5, v[1:2], off      ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x4c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sshort v5, v1, s[4:5] nv    ; encoding: [0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x4c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sshort v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x4e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dword v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x50,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1053,6 +1401,18 @@
 # CHECK: global_load_dword v5, v[1:2], off       ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x50,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dword v5, v1, s[4:5] nv     ; encoding: [0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x50,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x50,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x51,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dword v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x52,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x54,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1062,6 +1422,18 @@
 # CHECK: global_load_dwordx2 v[5:6], v[1:2], off ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x54,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] nv ; encoding: [0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x54,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x54,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x55,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx2 v[5:6], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x56,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x58,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1071,6 +1443,18 @@
 # CHECK: global_load_dwordx3 v[5:7], v[1:2], off ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x58,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] nv ; encoding: [0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x58,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x58,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x59,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx3 v[5:7], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1080,6 +1464,18 @@
 # CHECK: global_load_dwordx4 v[5:8], v[1:2], off ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x5c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] nv ; encoding: [0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x5c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_dwordx4 v[5:8], v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x5e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_store_byte v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x60,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1089,6 +1485,18 @@
 # CHECK: global_store_byte v[1:2], v2, off       ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x60,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_byte v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x60,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x60,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x61,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x62,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x64,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1098,6 +1506,18 @@
 # CHECK: global_store_byte_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x64,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x64,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x64,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x65,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_byte_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x66,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_short v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x68,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1107,6 +1527,18 @@
 # CHECK: global_store_short v[1:2], v2, off      ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x68,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_short v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x68,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x68,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x69,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6a,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_short_d16_hi v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1116,6 +1548,18 @@
 # CHECK: global_store_short_d16_hi v[1:2], v2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x6c,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] nv ; encoding: [0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x6c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6d,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_short_d16_hi v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x6e,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dword v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x70,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1125,6 +1569,18 @@
 # CHECK: global_store_dword v[1:2], v2, off      ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x70,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dword v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x70,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x70,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x71,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dword v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x72,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x74,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1134,6 +1590,18 @@
 # CHECK: global_store_dwordx2 v[1:2], v[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x74,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x74,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x74,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x75,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx2 v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x76,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x78,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1143,6 +1611,18 @@
 # CHECK: global_store_dwordx3 v[1:2], v[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x78,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] nv ; encoding: [0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x78,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x78,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x79,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx3 v1, v[2:4], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7a,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
@@ -1152,6 +1632,18 @@
 # CHECK: global_store_dwordx4 v[1:2], v[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x7c,0xdc,0x01,0x02,0x7f,0x00
 
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] nv ; encoding: [0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00]
+0x00,0x80,0x7c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7c,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7d,0xdc,0x01,0x02,0x86,0x00
+
+# CHECK: global_store_dwordx4 v1, v[2:5], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x7e,0xdc,0x01,0x02,0x86,0x00
+
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x80,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1161,6 +1653,18 @@
 # CHECK: global_load_ubyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x80,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x80,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x80,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x81,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x82,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x84,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1170,6 +1674,18 @@
 # CHECK: global_load_ubyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x84,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x84,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x84,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x85,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_ubyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x86,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x88,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1179,6 +1695,18 @@
 # CHECK: global_load_sbyte_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x88,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x88,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x88,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x89,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8a,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1188,6 +1716,18 @@
 # CHECK: global_load_sbyte_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x8c,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x8c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8c,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8d,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_sbyte_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x8e,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_short_d16 v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x90,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1197,6 +1737,18 @@
 # CHECK: global_load_short_d16 v5, v[1:2], off   ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x90,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_short_d16 v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x90,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x90,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x91,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16 v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x92,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_load_short_d16_hi v5, v[1:2], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0xff,0x9f,0x94,0xdc,0x01,0x00,0x7f,0x05
 
@@ -1206,6 +1758,18 @@
 # CHECK: global_load_short_d16_hi v5, v[1:2], off ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05]
 0x00,0x80,0x94,0xdc,0x01,0x00,0x7f,0x05
 
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] nv ; encoding: [0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05]
+0x00,0x80,0x94,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 nv ; encoding: [0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x94,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 glc nv ; encoding: [0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x95,0xdc,0x01,0x00,0x84,0x05
+
+# CHECK: global_load_short_d16_hi v5, v1, s[4:5] offset:-1 slc nv ; encoding: [0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05]
+0xff,0x9f,0x96,0xdc,0x01,0x00,0x84,0x05
+
 # CHECK: global_atomic_swap v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x00,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1215,6 +1779,18 @@
 # CHECK: global_atomic_swap v[1:2], v2, off      ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x00,0xdd,0x01,0x02,0x7f,0x00
 
+# CHECK: global_atomic_swap v1, v2, s[6:7] nv    ; encoding: [0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x00,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x00,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x01,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_swap v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x02,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_cmpswap v[1:2], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x04,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1236,6 +1812,18 @@
 # CHECK: global_atomic_cmpswap v1, v[2:3], v[4:5], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01]
 0x00,0x80,0x05,0xdd,0x02,0x04,0x7f,0x01
 
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] nv ; encoding: [0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x04,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x04,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v0, v1, v[2:3], s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x05,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_cmpswap v1, v[2:3], s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x06,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_add v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x08,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1245,6 +1833,18 @@
 # CHECK: global_atomic_add v[1:2], v2, off       ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00]
 0x00,0x80,0x08,0xdd,0x01,0x02,0x7f,0x00
 
+# CHECK: global_atomic_add v1, v2, s[6:7] nv     ; encoding: [0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00]
+0x00,0x80,0x08,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 nv ; encoding: [0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x08,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v0, v1, v2, s[6:7] offset:-1 glc nv ; encoding: [0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x09,0xdd,0x01,0x02,0x86,0x00
+
+# CHECK: global_atomic_add v1, v2, s[6:7] offset:-1 slc nv ; encoding: [0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00]
+0xff,0x9f,0x0a,0xdd,0x01,0x02,0x86,0x00
+
 # CHECK: global_atomic_sub v[1:2], v2, off offset:-1 ; encoding: [0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00]
 0xff,0x9f,0x0c,0xdd,0x01,0x02,0x7f,0x00
 
@@ -1503,6 +2103,18 @@
 # CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x42,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x44,0xdc,0x00,0x00,0x02,0x05
 
@@ -1542,6 +2154,18 @@
 # CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x46,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte v5, off, s2 nv       ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x48,0xdc,0x00,0x00,0x02,0x05
 
@@ -1581,6 +2205,18 @@
 # CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ushort v5, off, s2 nv      ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ushort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4c,0xdc,0x00,0x00,0x02,0x05
 
@@ -1620,6 +2256,18 @@
 # CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x4e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sshort v5, off, s2 nv      ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sshort v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x50,0xdc,0x00,0x00,0x02,0x05
 
@@ -1659,6 +2307,18 @@
 # CHECK: scratch_load_dword v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x52,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dword v5, off, s2 nv       ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dword v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x54,0xdc,0x00,0x00,0x02,0x05
 
@@ -1698,6 +2358,18 @@
 # CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x56,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 nv ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx2 v[5:6], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x58,0xdc,0x00,0x00,0x02,0x05
 
@@ -1737,6 +2409,18 @@
 # CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 nv ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx3 v[5:7], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5c,0xdc,0x00,0x00,0x02,0x05
 
@@ -1776,6 +2460,18 @@
 # CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x5e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 nv ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_dwordx4 v[5:8], off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x60,0xdc,0x00,0x02,0x03,0x00
 
@@ -1815,6 +2511,18 @@
 # CHECK: scratch_store_byte off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x62,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_byte off, v2, s3 nv       ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x64,0xdc,0x00,0x02,0x03,0x00
 
@@ -1854,6 +2562,18 @@
 # CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x66,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_byte_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_short off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x68,0xdc,0x00,0x02,0x03,0x00
 
@@ -1893,6 +2613,18 @@
 # CHECK: scratch_store_short off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6a,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_short off, v2, s3 nv      ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6c,0xdc,0x00,0x02,0x03,0x00
 
@@ -1932,6 +2664,18 @@
 # CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x6e,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_short_d16_hi off, v2, s3 nv ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_short_d16_hi off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00
 
@@ -1971,6 +2715,18 @@
 # CHECK: scratch_store_dword off, v2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x72,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dword off, v2, s3 nv      ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 nv ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x74,0xdc,0x00,0x02,0x03,0x00
 
@@ -2010,6 +2766,18 @@
 # CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x76,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 nv ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx2 off, v[2:3], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x78,0xdc,0x00,0x02,0x03,0x00
 
@@ -2049,6 +2817,18 @@
 # CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7a,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 nv ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx3 off, v[2:4], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7c,0xdc,0x00,0x02,0x03,0x00
 
@@ -2088,6 +2868,18 @@
 # CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00]
 0xff,0x5f,0x7e,0xdc,0x00,0x02,0x03,0x00
 
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 nv ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 nv ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 glc nv ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00
+
+# CHECK: scratch_store_dwordx4 off, v[2:5], s3 offset:-1 slc nv ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00]
+0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00
+
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x80,0xdc,0x00,0x00,0x02,0x05
 
@@ -2127,6 +2919,18 @@
 # CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x82,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x84,0xdc,0x00,0x00,0x02,0x05
 
@@ -2166,6 +2970,18 @@
 # CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x86,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_ubyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x88,0xdc,0x00,0x00,0x02,0x05
 
@@ -2205,6 +3021,18 @@
 # CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8a,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8c,0xdc,0x00,0x00,0x02,0x05
 
@@ -2244,6 +3072,18 @@
 # CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x8e,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_sbyte_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x90,0xdc,0x00,0x00,0x02,0x05
 
@@ -2283,6 +3123,18 @@
 # CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x92,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_short_d16 v5, off, s2 nv   ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16 v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x94,0xdc,0x00,0x00,0x02,0x05
 
@@ -2322,6 +3174,18 @@
 # CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05]
 0xff,0x5f,0x96,0xdc,0x00,0x00,0x02,0x05
 
+# CHECK: scratch_load_short_d16_hi v5, off, s2 nv ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05]
+0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 nv ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 glc nv ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05
+
+# CHECK: scratch_load_short_d16_hi v5, off, s2 offset:-1 slc nv ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05]
+0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05
+
 # CHECK: global_load_dword v[2:3], off lds       ; encoding: [0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00]
 0x00,0xa0,0x50,0xdc,0x02,0x00,0x7f,0x00
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index 2661ed5b04cc9..b27a50d93f5b9 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -250,6 +250,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x10,0x23,0x31,0x03
 
+#CHECK: xvrlw 34, 15, 16
+0xf0,0x4f,0x85,0xc1
+
 #CHECK: xxaes192encp 8, 10, 14
 0xf1,0x0b,0x76,0x10
 
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index 7fb8254ced0ac..72662d9736740 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -244,6 +244,9 @@
 #CHECK: vucmprhh 1, 3, 6
 0x03,0x31,0x23,0x10
 
+#CHECK: xvrlw 34, 15, 16
+0xc1,0x85,0x4f,0xf0
+
 #CHECK: xxaes192encp 8, 10, 14
 0x10,0x76,0x0b,0xf1
 
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index 40059c440b128..ab72649fc3404 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -355,6 +355,10 @@
 #CHECK-BE: vucmprhh 1, 3, 6               # encoding: [0x10,0x23,0x31,0x03]
 #CHECK-LE: vucmprhh 1, 3, 6               # encoding: [0x03,0x31,0x23,0x10]
 
+           xvrlw 34, 15, 16
+#CHECK-BE: xvrlw 34, 15, 16              # encoding: [0xf0,0x4f,0x85,0xc1]
+#CHECK-LE: xvrlw 34, 15, 16              # encoding: [0xc1,0x85,0x4f,0xf0]
+
            xxaes192encp 8, 10, 14
 #CHECK-BE: xxaes192encp 8, 10, 14         # encoding: [0xf1,0x0b,0x76,0x10]
 #CHECK-LE: xxaes192encp 8, 10, 14         # encoding: [0x10,0x76,0x0b,0xf1]
diff --git a/llvm/test/MC/Xtensa/s32c1i.s b/llvm/test/MC/Xtensa/s32c1i.s
new file mode 100644
index 0000000000000..218a86dd56752
--- /dev/null
+++ b/llvm/test/MC/Xtensa/s32c1i.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+s32c1i \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# CHECK-INST: xsr a3, atomctl
+# CHECK: # encoding: [0x30,0x63,0x61]
+xsr a3, atomctl
+
+# CHECK-INST: xsr a3, scompare1
+# CHECK: # encoding: [0x30,0x0c,0x61]
+xsr a3, scompare1
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index 2904474f6110b..e4a7126d79fbd 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -53,21 +53,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:   });
 // CHECK-NEXT:   AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT:        {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT:        {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT:        RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT:        RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:      setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
@@ -80,21 +80,21 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:   });
 // CHECK-NEXT:   AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:   for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:     setLibcallImpl(Func, Impl);
+// CHECK-NEXT:   for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:     setAvailable(Impl);
 // CHECK-NEXT:   }
 // CHECK-EMPTY:
-// CHECK-NEXT:   static const LibcallImplPair LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
-// CHECK-NEXT:       {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
-// CHECK-NEXT:       {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:   static const RTLIB::LibcallImpl LibraryCalls_AlwaysAvailable_AVR_BUILTIN[] = {
+// CHECK-NEXT:       RTLIB::impl___divmodqi4, // __divmodqi4
+// CHECK-NEXT:       RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:   };
 // CHECK-EMPTY:
-// CHECK-NEXT:   for (const auto [Func, Impl] : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
-// CHECK-NEXT:     setLibcallImpl(Func, Impl);
+// CHECK-NEXT:   for (const RTLIB::LibcallImpl Impl : LibraryCalls_AlwaysAvailable_AVR_BUILTIN) {
+// CHECK-NEXT:     setAvailable(Impl);
 // CHECK-NEXT:     setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:   }
 // CHECK-EMPTY:
@@ -107,33 +107,33 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_malloc, // malloc
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if ( isFoo() ) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
-// CHECK-NEXT:          {RTLIB::SDIVREM_I8, RTLIB::impl___divmodqi4}, // __divmodqi4
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_anonymous_3_AVR_BUILTIN[] = {
+// CHECK-NEXT:          RTLIB::impl___divmodqi4, // __divmodqi4
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_anonymous_3_AVR_BUILTIN) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_3_AVR_BUILTIN) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:        setLibcallImplCallingConv(Impl, CallingConv::AVR_BUILTIN);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if ( isBar() ) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
-// CHECK-NEXT:          {RTLIB::UDIVREM_I16, RTLIB::impl___udivmodhi4}, // __udivmodhi4
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_anonymous_5_MSP430_BUILTIN[] = {
+// CHECK-NEXT:          RTLIB::impl___udivmodhi4, // __udivmodhi4
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_anonymous_5_MSP430_BUILTIN) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:        setLibcallImplCallingConv(Impl, CallingConv::MSP430_BUILTIN);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
index f9a148a183806..82206ce6ba254 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
@@ -31,12 +31,12 @@ def dup1 : RuntimeLibcallImpl<ANOTHER_DUP>;
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
 
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_func_b, // func_b
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
@@ -53,13 +53,13 @@ def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT:        {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_other_func, // other_func
+// CHECK-NEXT:        RTLIB::impl_func_a, // func_a
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
@@ -76,14 +76,14 @@ def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
 // CHECK-NEXT: });
 // CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:     static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:         {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
-// CHECK-NEXT:         {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT:         {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT:     static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:         RTLIB::impl_dup1, // dup1
+// CHECK-NEXT:         RTLIB::impl_other_func, // other_func
+// CHECK-NEXT:         RTLIB::impl_func_a, // func_a
 // CHECK-NEXT:     };
 // CHECK-EMPTY:
-// CHECK-NEXT:     for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:       setLibcallImpl(Func, Impl);
+// CHECK-NEXT:     for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:       setAvailable(Impl);
 // CHECK-NEXT:     }
 // CHECK-EMPTY:
 // CHECK-NEXT:    return;
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index 7aaf3a0e8e1cf..2a1cc72efcd4b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -200,10 +200,6 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT: }
 
 // CHECK: void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets(const llvm::Triple &TT, ExceptionHandling ExceptionModel, FloatABI::ABIType FloatABI, EABI EABIVersion, StringRef ABIName) {
-// CHECK-NEXT:  struct LibcallImplPair {
-// CHECK-NEXT:    RTLIB::Libcall Func;
-// CHECK-NEXT:    RTLIB::LibcallImpl Impl;
-// CHECK-NEXT:  };
 // CHECK-EMPTY:
 // CHECK-NEXT: if (TT.getArch() == Triple::blah) {
 // CHECK-NEXT:     static constexpr LibcallImplBitset SystemAvailableImpls({
@@ -211,35 +207,35 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:     });
 // CHECK-NEXT:     AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT:        {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT:        {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_bzero, // bzero
+// CHECK-NEXT:        RTLIB::impl_calloc, // calloc
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.hasCompilerRT()) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_hasCompilerRT[] = {
-// CHECK-NEXT:          {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:          {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_hasCompilerRT[] = {
+// CHECK-NEXT:          RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:          RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_hasCompilerRT) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_hasCompilerRT) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT:          {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT:          RTLIB::impl____memset, // ___memset
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
@@ -253,14 +249,14 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:        {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT:        {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT:        RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:   return;
@@ -272,22 +268,22 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT:        {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_bzero, // bzero
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f128, // sqrtl
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:    if (TT.getOS() == Triple::bar) {
-// CHECK-NEXT:      static const LibcallImplPair LibraryCalls_isBarOS[] = {
-// CHECK-NEXT:          {RTLIB::MEMSET, RTLIB::impl____memset}, // ___memset
+// CHECK-NEXT:      static const RTLIB::LibcallImpl LibraryCalls_isBarOS[] = {
+// CHECK-NEXT:          RTLIB::impl____memset, // ___memset
 // CHECK-NEXT:      };
 // CHECK-EMPTY:
-// CHECK-NEXT:      for (const auto [Func, Impl] : LibraryCalls_isBarOS) {
-// CHECK-NEXT:        setLibcallImpl(Func, Impl);
+// CHECK-NEXT:      for (const RTLIB::LibcallImpl Impl : LibraryCalls_isBarOS) {
+// CHECK-NEXT:        setAvailable(Impl);
 // CHECK-NEXT:      }
 // CHECK-EMPTY:
 // CHECK-NEXT:    }
@@ -301,15 +297,15 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
 // CHECK-NEXT:    });
 // CHECK-NEXT:    AvailableLibcallImpls = SystemAvailableImpls;
 // CHECK-EMPTY:
-// CHECK-NEXT:    static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT:        {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT:        {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
-// CHECK-NEXT:        {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
-// CHECK-NEXT:        {RTLIB::SRL_I64, RTLIB::impl___lshrdi3}, // __lshrdi3
+// CHECK-NEXT:    static const RTLIB::LibcallImpl LibraryCalls[] = {
+// CHECK-NEXT:        RTLIB::impl_calloc, // calloc
+// CHECK-NEXT:        RTLIB::impl___ashlsi3, // __ashlsi3
+// CHECK-NEXT:        RTLIB::impl_sqrtl_f80, // sqrtl
+// CHECK-NEXT:        RTLIB::impl___lshrdi3, // __lshrdi3
 // CHECK-NEXT:    };
 // CHECK-EMPTY:
-// CHECK-NEXT:    for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT:      setLibcallImpl(Func, Impl);
+// CHECK-NEXT:    for (const RTLIB::LibcallImpl Impl : LibraryCalls) {
+// CHECK-NEXT:      setAvailable(Impl);
 // CHECK-NEXT:    }
 // CHECK-EMPTY:
 // CHECK-NEXT:   return;
diff --git a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
index 0c5c63db4c95b..cc0f87755cdc2 100644
--- a/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
+++ b/llvm/test/TableGen/SDNodeInfoEmitter/no-nodes.td
@@ -20,6 +20,7 @@ def MyTarget : Target;
 // CHECK-EMPTY:
 // CHECK-NEXT:  namespace llvm {
 // CHECK-EMPTY:
+// CHECK-EMPTY:
 // CHECK-NEXT:  #ifdef __GNUC__
 // CHECK-NEXT:  #pragma GCC diagnostic push
 // CHECK-NEXT:  #pragma GCC diagnostic ignored "-Woverlength-strings"
diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td
index 8648651f3d714..5bd7890e0ddd1 100644
--- a/llvm/test/TableGen/directive1.td
+++ b/llvm/test/TableGen/directive1.td
@@ -186,8 +186,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL:       #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT:  #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT:  namespace llvm {
-// IMPL-NEXT:  namespace tdl {
+// IMPL-NEXT:  namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT:    // Sets for dira
 // IMPL-EMPTY:
@@ -204,8 +203,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT:    static  requiredClauses_TDLD_dira {
 // IMPL-NEXT:    };
-// IMPL-NEXT:  } // namespace tdl
-// IMPL-NEXT:  } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT:  } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT:  #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td
index 96022d7647440..eaaf82ddaaf41 100644
--- a/llvm/test/TableGen/directive2.td
+++ b/llvm/test/TableGen/directive2.td
@@ -159,8 +159,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL:      #ifdef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-NEXT: #undef GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
-// IMPL-NEXT: namespace llvm {
-// IMPL-NEXT: namespace tdl {
+// IMPL-NEXT: namespace llvm::tdl {
 // IMPL-EMPTY:
 // IMPL-NEXT:   // Sets for dira
 // IMPL-EMPTY:
@@ -177,8 +176,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> {
 // IMPL-EMPTY:
 // IMPL-NEXT:   static  requiredClauses_TDLD_dira {
 // IMPL-NEXT:   };
-// IMPL-NEXT: } // namespace tdl
-// IMPL-NEXT: } // namespace llvm
+// IMPL-EMPTY:
+// IMPL-NEXT: } // namespace llvm::tdl
 // IMPL-EMPTY:
 // IMPL-NEXT: #endif // GEN_FLANG_DIRECTIVE_CLAUSE_SETS
 // IMPL-EMPTY:
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index 8ffacb9bdd5f6..1b728f56ab2ea 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -1,7 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s
 
-define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
+define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) !prof !0 {
 ; CHECK-LABEL: @test_atomicrmw_fadd_f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
@@ -14,7 +14,7 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !prof [[PROF1:![0-9]+]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret float [[TMP5]]
 ;
@@ -336,3 +336,11 @@ define <2 x half> @atomicrmw_fminimum_2_x_half(ptr %ptr, <2 x half> %val) {
   %res = atomicrmw fminimum ptr %ptr, <2 x half> %val seq_cst
   ret <2 x half> %res
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
index 97ea2c6708dad..2828882afe779 100644
--- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
+++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-apple-macosx10.10.0"
 @E = common global ptr null, align 8
 
 ; CHECK-LABEL: @f(
-define void @f() {
+define void @f() !prof !{!"function_entry_count", i32 10} {
 entry:
   %a = load ptr, ptr @A, align 8
   %b = load ptr, ptr @B, align 8
@@ -55,7 +55,7 @@ entry:
 ; CHECK:     = icmp
 
 ; CHECK-NOT: = icmp
-; CHECK:     br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1
+; CHECK:     br i1 %conflict.rdx15, label %for.body.ph.lver.orig, label %for.body.ph.ldist1, !prof ![[PROF1:[0-9]+]]
 
 ; The non-distributed loop that the memchecks fall back on.
 
@@ -289,3 +289,4 @@ attributes #1 = { nounwind convergent }
 
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.distribute.enable", i1 true}
+; CHECK: ![[PROF1]] = !{!"unknown", !"loop-versioning"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
new file mode 100644
index 0000000000000..bd5f4e2a3279b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16  -S < %s | FileCheck %s
+; Each of the 32 scalar iterations loads in[0..3] and stores 16 bytes to out[0..15] (zero, the inputs, and sums of inputs); the CHECK lines expect a single interleaved <64 x i8> store.
+define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) {
+; CHECK-LABEL: define dso_local void @_Z6unpackPhS_(
+; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) {
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT:    [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT:    store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ] ; iteration counter
+  %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ] ; current output cursor ("out" below)
+  %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ] ; current input cursor ("in" below)
+  store i8 0, ptr %out.addr.032, align 1 ; out[0] = 0
+  %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3
+  %0 = load i8, ptr %arrayidx10, align 1 ; %0 = in[3]
+  %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1
+  store i8 %0, ptr %arrayidx14, align 1 ; out[1] = in[3]
+  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2
+  %1 = load i8, ptr %arrayidx10.1, align 1 ; %1 = in[2]
+  %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2
+  store i8 %1, ptr %arrayidx14.1, align 1 ; out[2] = in[2]
+  %add.2 = add i8 %0, %1
+  %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3
+  store i8 %add.2, ptr %arrayidx14.2, align 1 ; out[3] = in[3] + in[2]
+  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1
+  %2 = load i8, ptr %arrayidx10.3, align 1 ; %2 = in[1]
+  %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4
+  store i8 %2, ptr %arrayidx14.3, align 1 ; out[4] = in[1]
+  %add.4 = add i8 %0, %2
+  %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5
+  store i8 %add.4, ptr %arrayidx14.4, align 1 ; out[5] = in[3] + in[1]
+  %add.5 = add i8 %1, %2
+  %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6
+  store i8 %add.5, ptr %arrayidx14.5, align 1 ; out[6] = in[2] + in[1]
+  %add.6 = add i8 %0, %add.5
+  %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7
+  store i8 %add.6, ptr %arrayidx14.6, align 1 ; out[7] = in[3] + in[2] + in[1]
+  %3 = load i8, ptr %in.addr.031, align 1 ; %3 = in[0]
+  %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8
+  store i8 %3, ptr %arrayidx14.7, align 1 ; out[8] = in[0]
+  %add.8 = add i8 %0, %3
+  %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9
+  store i8 %add.8, ptr %arrayidx14.8, align 1 ; out[9] = in[3] + in[0]
+  %add.9 = add i8 %1, %3
+  %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10
+  store i8 %add.9, ptr %arrayidx14.9, align 1 ; out[10] = in[2] + in[0]
+  %add.10 = add i8 %0, %add.9
+  %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11
+  store i8 %add.10, ptr %arrayidx14.10, align 1 ; out[11] = in[3] + in[2] + in[0]
+  %add.11 = add i8 %2, %3
+  %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12
+  store i8 %add.11, ptr %arrayidx14.11, align 1 ; out[12] = in[1] + in[0]
+  %add.12 = add i8 %0, %add.11
+  %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13
+  store i8 %add.12, ptr %arrayidx14.12, align 1 ; out[13] = in[3] + in[1] + in[0]
+  %add.13 = add i8 %1, %add.11
+  %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14
+  store i8 %add.13, ptr %arrayidx14.13, align 1 ; out[14] = in[2] + in[1] + in[0]
+  %add.14 = add i8 %0, %add.13
+  %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15
+  store i8 %add.14, ptr %arrayidx14.14, align 1 ; out[15] = in[3] + in[2] + in[1] + in[0]
+  %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16 ; advance output by 16 bytes
+  %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4 ; advance input by 4 bytes
+  %inc17 = add nuw nsw i32 %i.033, 1
+  %exitcond.not = icmp eq i32 %inc17, 32 ; exit after the 32nd iteration
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index 68cfc659e1e94..cdddcc9fc4226 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
-; RUN: opt -p loop-vectorize -S %s | FileCheck %s
+; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s
 
 target triple = "arm64-apple-macosx15.0.0"
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
new file mode 100644
index 0000000000000..bba7d058d6637
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
@@ -0,0 +1,694 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF4 %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx"
+
+define void @test_2xi64_matching_zext_interleave_group(ptr noalias %dst, ptr %src) { ; Both stores to %dst write the same zext of one i32 loaded from %src.
+; VF2-LABEL: define void @test_2xi64_matching_zext_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_zext_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64 ; same cast and operand as %ext.0
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_matching_sext_interleave_group(ptr noalias %dst, ptr %src) { ; Both stores to %dst write the same sext of one i32 loaded from %src.
+; VF2-LABEL: define void @test_2xi64_matching_sext_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_sext_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = sext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = sext i32 %l.0 to i64 ; same cast and operand as %ext.0
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) { ; The two stored values extend the same load differently: zext first, sext second.
+; VF2-LABEL: define void @test_2xi64_mismatching_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP4:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP4:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = sext i32 %l.0 to i64 ; sext here, whereas %ext.0 is a zext of the same value
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_matching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { ; Both stored values are zext(load) + 2, computed by two identical instruction chains.
+; VF2-LABEL: define void @test_2xi64_matching_cast_add_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_matching_cast_add_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2)
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = zext i32 %l.0 to i64
+  %add.0 = add i64 %ext.0, 2
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %add.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64 ; same cast and operand as %ext.0
+  %add.1 = add i64 %ext.1, 2 ; same add-by-2 as %add.0, applied after the cast
+  store i64 %add.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_cast_add_interleave_group(ptr noalias %dst, ptr %src) { ; Both stored values add 2 after the cast, but the casts differ: sext first, zext second.
+; VF2-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2)
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
+; VF2-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP5]], splat (i64 2)
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_cast_add_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[TMP2]], splat (i64 2)
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; VF4-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[TMP5]], splat (i64 2)
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %ext.0 = sext i32 %l.0 to i64
+  %add.0 = add i64 %ext.0, 2
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %add.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  %ext.1 = zext i32 %l.0 to i64 ; zext here, whereas %ext.0 is a sext of the same value
+  %add.1 = add i64 %ext.1, 2 ; same add-by-2 as %add.0, but on the differently extended value
+  store i64 %add.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_add_cast_interleave_group(ptr noalias %dst, ptr %src) { ; Both stored values are zext(load + 2): the add is done in i32 before the cast.
+; VF2-LABEL: define void @test_2xi64_add_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_add_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %add.1 = add i32 %l.0, 2 ; same i32 add as %add.0
+  %ext.1 = zext i32 %add.1 to i64 ; same cast kind as %ext.0
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_mismatching_add_cast_interleave_group(ptr noalias %dst, ptr %src) { ; Both stored values are zext'd, but the i32 ops before the cast differ: add 2 vs. sub 2.
+; VF2-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64>
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_mismatching_add_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64>
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %add.1 = sub i32 %l.0, 2 ; despite the %add.1 name this is a sub, so it differs from %add.0
+  %ext.1 = zext i32 %add.1 to i64 ; same cast kind as %ext.0, different operand
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_add_mismatching_cast_interleave_group(ptr noalias %dst, ptr %src) { ; Both stored values start from load + 2 in i32, but the casts differ: zext first, sext second.
+; VF2-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_add_mismatching_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1 ; %idx.0 = 2 * %iv, index of the first stored member
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = add i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1 ; %idx.1 = 2 * %iv + 1, the adjacent second member
+  %add.1 = add i32 %l.0, 2 ; same i32 add as %add.0
+  %ext.1 = sext i32 %add.1 to i64 ; sext here, whereas %ext.0 is a zext
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100 ; leave the loop after 100 iterations
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(ptr noalias %dst, ptr %src) {
+; VF2-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(
+; VF2-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF2-NEXT:  [[ENTRY:.*:]]
+; VF2-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF2:       [[VECTOR_PH]]:
+; VF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF2:       [[VECTOR_BODY]]:
+; VF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF2-NEXT:    [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF2-NEXT:    [[TMP5:%.*]] = sub <2 x i32> splat (i32 2), [[WIDE_LOAD]]
+; VF2-NEXT:    [[TMP6:%.*]] = zext <2 x i32> [[TMP5]] to <2 x i64>
+; VF2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; VF2-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; VF2-NEXT:    store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF2-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF2:       [[MIDDLE_BLOCK]]:
+; VF2-NEXT:    br label %[[EXIT:.*]]
+; VF2:       [[EXIT]]:
+; VF2-NEXT:    ret void
+;
+; VF4-LABEL: define void @test_2xi64_sub_mismatching_ops_cast_interleave_group(
+; VF4-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) {
+; VF4-NEXT:  [[ENTRY:.*:]]
+; VF4-NEXT:    br label %[[VECTOR_PH:.*]]
+; VF4:       [[VECTOR_PH]]:
+; VF4-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF4:       [[VECTOR_BODY]]:
+; VF4-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF4-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
+; VF4-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; VF4-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 8
+; VF4-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[WIDE_LOAD]], splat (i32 2)
+; VF4-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; VF4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
+; VF4-NEXT:    [[TMP5:%.*]] = sub <4 x i32> splat (i32 2), [[WIDE_LOAD]]
+; VF4-NEXT:    [[TMP6:%.*]] = zext <4 x i32> [[TMP5]] to <4 x i64>
+; VF4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VF4-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; VF4-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
+; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; VF4-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; VF4:       [[MIDDLE_BLOCK]]:
+; VF4-NEXT:    br label %[[EXIT:.*]]
+; VF4:       [[EXIT]]:
+; VF4-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %idx.0 = shl nsw i64 %iv, 1
+  %gep.src.0 = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l.0 = load i32 , ptr %gep.src.0, align 8
+  %add.0 = sub i32 %l.0, 2
+  %ext.0 = zext i32 %add.0 to i64
+  %dst.0 = getelementptr inbounds i64, ptr %dst, i64 %idx.0
+  store i64 %ext.0, ptr %dst.0, align 8
+  %idx.1 = add i64 %idx.0, 1
+  %add.1 = sub i32 2, %l.0
+  %ext.1 = zext i32 %add.1 to i64
+  %dst.1 = getelementptr inbounds i64, ptr %dst, i64 %idx.1
+  store i64 %ext.1, ptr %dst.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
index f2ae327778f4a..54b7f2afe1ed0 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll
@@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef
 ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]]
-; CHECK-NEXT:    store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]]
+; CHECK-NEXT:    store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144
 ; CHECK-NEXT:    br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[FOR_END11]]:
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
index d818335f075e5..e48c2b46a138a 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/switch-of-powers-of-two.ll
@@ -141,5 +141,5 @@ return:
 ;.
 ; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10}
 ; CHECK: [[PROF1]] = !{!"branch_weights", i32 58, i32 5}
-; CHECK: [[PROF2]] = !{!"branch_weights", i32 56, i32 5}
+; CHECK: [[PROF2]] = !{!"branch_weights", i32 53, i32 5}
 ;.
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
index c005316f07f06..4c8c829a59f3c 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll
@@ -10,10 +10,15 @@ define float @sinf(float %x) {
   ret float %x
 }
 
+; CHECK: declare void @_Unwind_Resume(...)
+
+; CHECK: declare void @__umodti3(...)
+
 ; CHECK: declare void @acosf(...)
 
-; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]]
 ; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]]
 
-; CHECK: declare void @__umodti3(...)
+; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]]
+
+; CHECK: declare void @truncl(...)
 
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
index f0f09e97d9dba..57cb016bcb7f3 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
@@ -7,14 +7,14 @@
 ; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %}
 ; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %}
 
-; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 
-; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]
+; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 
-; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]]
 ; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]]
+; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]]
 
 ; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) }
 ; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) }
diff --git a/llvm/utils/TableGen/Basic/CMakeLists.txt b/llvm/utils/TableGen/Basic/CMakeLists.txt
index b4a66ecce6440..2030e9add7f30 100644
--- a/llvm/utils/TableGen/Basic/CMakeLists.txt
+++ b/llvm/utils/TableGen/Basic/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_library(LLVMTableGenBasic OBJECT EXCLUDE_FROM_ALL DISABLE_LLVM_LINK_LLV
   IntrinsicEmitter.cpp
   RISCVTargetDefEmitter.cpp
   RuntimeLibcallsEmitter.cpp
+  RuntimeLibcalls.cpp
   SDNodeProperties.cpp
   TableGen.cpp
   TargetFeaturesEmitter.cpp
diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
index d33bf45595e2e..0bb743dc8a7f5 100644
--- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp
@@ -359,7 +359,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) {
     OS << "  static constexpr bool is_iterable = true;\n";
     OS << "};\n";
   }
-  LlvmNS.close();
 }
 
 // Given a list of spellings (for a given clause/directive), order them
@@ -931,27 +930,20 @@ static void generateClauseSet(ArrayRef<const Record *> VerClauses,
 // Generate an enum set for the 4 kinds of clauses linked to a directive.
 static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
                                         Frontend FE, raw_ostream &OS) {
+  IfDefEmitter Scope(OS, "GEN_" + getFESpelling(FE).upper() +
+                             "_DIRECTIVE_CLAUSE_SETS");
 
-  std::string IfDefName{"GEN_"};
-  IfDefName += getFESpelling(FE).upper();
-  IfDefName += "_DIRECTIVE_CLAUSE_SETS";
-  IfDefEmitter Scope(OS, IfDefName);
-
-  StringRef Namespace =
-      getFESpelling(FE == Frontend::Flang ? Frontend::LLVM : FE);
+  std::string Namespace =
+      getFESpelling(FE == Frontend::Flang ? Frontend::LLVM : FE).str();
   // The namespace has to be different for clang vs flang, as 2 structs with the
   // same name but different layout is UB.  So just put the 'clang' on in the
   // clang namespace.
-  OS << "namespace " << Namespace << " {\n";
-
-  // Open namespaces defined in the directive language.
-  SmallVector<StringRef, 2> Namespaces;
-  SplitString(DirLang.getCppNamespace(), Namespaces, "::");
-  for (auto Ns : Namespaces)
-    OS << "namespace " << Ns << " {\n";
+  // Additionally, open namespaces defined in the directive language.
+  if (!DirLang.getCppNamespace().empty())
+    Namespace += "::" + DirLang.getCppNamespace().str();
+  NamespaceEmitter NS(OS, Namespace);
 
   for (const Directive Dir : DirLang.getDirectives()) {
-    OS << "\n";
     OS << "// Sets for " << Dir.getSpellingForIdentifier() << "\n";
 
     generateClauseSet(Dir.getAllowedClauses(), OS, "allowedClauses_", Dir,
@@ -963,12 +955,6 @@ static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
     generateClauseSet(Dir.getRequiredClauses(), OS, "requiredClauses_", Dir,
                       DirLang, FE);
   }
-
-  // Closing namespaces
-  for (auto Ns : reverse(Namespaces))
-    OS << "} // namespace " << Ns << "\n";
-
-  OS << "} // namespace " << Namespace << "\n";
 }
 
 // Generate a map of directive (key) with DirectiveClauses struct as values.
@@ -976,10 +962,8 @@ static void generateDirectiveClauseSets(const DirectiveLanguage &DirLang,
 // allowances (allowed, allowed once, allowed exclusive and required).
 static void generateDirectiveClauseMap(const DirectiveLanguage &DirLang,
                                        Frontend FE, raw_ostream &OS) {
-  std::string IfDefName{"GEN_"};
-  IfDefName += getFESpelling(FE).upper();
-  IfDefName += "_DIRECTIVE_CLAUSE_MAP";
-  IfDefEmitter Scope(OS, IfDefName);
+  IfDefEmitter Scope(OS, "GEN_" + getFESpelling(FE).upper() +
+                             "_DIRECTIVE_CLAUSE_MAP");
 
   OS << "{\n";
 
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp
new file mode 100644
index 0000000000000..1e609a2a8880b
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcalls.cpp
@@ -0,0 +1,102 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RuntimeLibcalls.h"
+#include "llvm/TableGen/Error.h"
+
+using namespace llvm;
+
+/// Index every RuntimeLibcall and RuntimeLibcallImpl record in \p Records.
+///
+/// Libcalls are numbered from 0 in the order the record keeper returns them.
+/// Implementations are sorted by function name (then record name) and numbered
+/// from 1.
+RuntimeLibcalls::RuntimeLibcalls(const RecordKeeper &Records) {
+  ArrayRef<const Record *> AllRuntimeLibcalls =
+      Records.getAllDerivedDefinitions("RuntimeLibcall");
+
+  // The lookup maps store pointers into the definition vectors, so reserve the
+  // final size up front to keep those pointers stable while appending.
+  RuntimeLibcallDefList.reserve(AllRuntimeLibcalls.size());
+
+  size_t CallTypeEnumVal = 0;
+  for (const Record *RuntimeLibcallDef : AllRuntimeLibcalls) {
+    RuntimeLibcallDefList.emplace_back(RuntimeLibcallDef, CallTypeEnumVal++);
+    Def2RuntimeLibcall[RuntimeLibcallDef] = &RuntimeLibcallDefList.back();
+  }
+
+  ArrayRef<const Record *> AllRuntimeLibcallImplsRaw =
+      Records.getAllDerivedDefinitions("RuntimeLibcallImpl");
+
+  SmallVector<const Record *, 1024> AllRuntimeLibcallImpls(
+      AllRuntimeLibcallImplsRaw);
+
+  // Sort by libcall impl name and secondarily by the enum name.
+  sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
+    return std::pair(A->getValueAsString("LibCallFuncName"), A->getName()) <
+           std::pair(B->getValueAsString("LibCallFuncName"), B->getName());
+  });
+
+  RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size());
+
+  // Implementation enum values start at 1.
+  size_t LibCallImplEnumVal = 1;
+  for (const Record *LibCallImplDef : AllRuntimeLibcallImpls) {
+    RuntimeLibcallImplDefList.emplace_back(LibCallImplDef, Def2RuntimeLibcall,
+                                           LibCallImplEnumVal++);
+
+    const RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
+    Def2RuntimeLibcallImpl[LibCallImplDef] = &LibCallImpl;
+
+    if (LibCallImpl.isDefault()) {
+      const RuntimeLibcall *Provides = LibCallImpl.getProvides();
+      if (!Provides)
+        PrintFatalError(LibCallImplDef->getLoc(),
+                        "default implementations must provide a libcall");
+      LibCallToDefaultImpl[Provides] = &LibCallImpl;
+    }
+  }
+}
+
+/// Expand a "LibcallImpls" set: add the members of its MemberList to \p Elts
+/// and, if the set carries a non-trivial availability predicate or a calling
+/// convention, record them for every member in Func2Preds.
+void LibcallPredicateExpander::expand(SetTheory &ST, const Record *Def,
+                                      SetTheory::RecSet &Elts) {
+  assert(Def->isSubClassOf("LibcallImpls"));
+
+  SetTheory::RecSet TmpElts;
+
+  ST.evaluate(Def->getValueInit("MemberList"), TmpElts, Def->getLoc());
+
+  Elts.insert(TmpElts.begin(), TmpElts.end());
+
+  AvailabilityPredicate AP(Def->getValueAsDef("AvailabilityPredicate"));
+  const Record *CCClass = Def->getValueAsOptionalDef("CallingConv");
+
+  // Nothing to record for an unconditional set without a calling convention.
+  if (AP.isAlwaysAvailable() && !CCClass)
+    return;
+
+  // This assumes a calling convention is never applied conditionally to some
+  // subsets but not others; that does not appear to be used.
+  for (const Record *LibcallImplDef : TmpElts) {
+    const RuntimeLibcallImpl *LibcallImpl =
+        Libcalls.getRuntimeLibcallImpl(LibcallImplDef);
+    auto [It, Inserted] = Func2Preds.insert({LibcallImpl, {{}, CCClass}});
+    if (!Inserted) {
+      PrintError(
+          Def,
+          "combining nested libcall set predicates currently unhandled: '" +
+              LibcallImpl->getLibcallFuncName() + "'");
+    }
+
+    It->second.first.push_back(AP.getDef());
+    It->second.second = CCClass;
+  }
+}
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcalls.h b/llvm/utils/TableGen/Basic/RuntimeLibcalls.h
new file mode 100644
index 0000000000000..6c9897602b2fa
--- /dev/null
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcalls.h
@@ -0,0 +1,225 @@
+//===------------------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrappers around the RuntimeLibcall and RuntimeLibcallImpl TableGen records,
+// used by RuntimeLibcallsEmitter.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_BASIC_RUNTIMELIBCALLS_H
+#define LLVM_UTILS_TABLEGEN_BASIC_RUNTIMELIBCALLS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+/// Wraps an availability predicate record and its "Cond" string. A null record
+/// or an empty condition string means the libcall is always available.
+class AvailabilityPredicate {
+  const Record *TheDef;
+  StringRef PredicateString;
+
+public:
+  AvailabilityPredicate(const Record *Def) : TheDef(Def) {
+    if (TheDef)
+      PredicateString = TheDef->getValueAsString("Cond");
+  }
+
+  const Record *getDef() const { return TheDef; }
+
+  bool isAlwaysAvailable() const { return PredicateString.empty(); }
+
+  /// Emit the opening `if (<Cond>) {` line guarding predicated code.
+  void emitIf(raw_ostream &OS) const {
+    OS << "if (" << PredicateString << ") {\n";
+  }
+
+  /// Emit the closing brace matching emitIf().
+  void emitEndIf(raw_ostream &OS) const { OS << "}\n"; }
+
+  /// Emit `_<record name>` when a predicate record is present.
+  void emitTableVariableNameSuffix(raw_ostream &OS) const {
+    if (TheDef)
+      OS << '_' << TheDef->getName();
+  }
+};
+
+class RuntimeLibcalls;
+class RuntimeLibcallImpl;
+
+/// Used to apply predicates to nested sets of libcalls.
+struct LibcallPredicateExpander : SetTheory::Expander {
+  const RuntimeLibcalls &Libcalls;
+
+  /// For each implementation, the predicate records collected for it and the
+  /// calling convention record (possibly null) of the set it came from.
+  DenseMap<const RuntimeLibcallImpl *,
+           std::pair<std::vector<const Record *>, const Record *>> &Func2Preds;
+
+  LibcallPredicateExpander(
+      const RuntimeLibcalls &Libcalls,
+      DenseMap<const RuntimeLibcallImpl *,
+               std::pair<std::vector<const Record *>, const Record *>>
+          &Func2Preds)
+      : Libcalls(Libcalls), Func2Preds(Func2Preds) {}
+
+  void expand(SetTheory &ST, const Record *Def,
+              SetTheory::RecSet &Elts) override;
+};
+
+/// A RuntimeLibcall record paired with its numeric enum value.
+class RuntimeLibcall {
+  const Record *TheDef = nullptr;
+  const size_t EnumVal;
+
+public:
+  RuntimeLibcall() = delete;
+  RuntimeLibcall(const Record *Def, size_t EnumVal)
+      : TheDef(Def), EnumVal(EnumVal) {
+    assert(Def);
+  }
+
+  ~RuntimeLibcall() { assert(TheDef); }
+
+  const Record *getDef() const { return TheDef; }
+
+  StringRef getName() const { return TheDef->getName(); }
+
+  size_t getEnumVal() const { return EnumVal; }
+
+  /// Emit the qualified enumerator, `RTLIB::<Name field>`.
+  void emitEnumEntry(raw_ostream &OS) const {
+    OS << "RTLIB::" << TheDef->getValueAsString("Name");
+  }
+};
+
+/// A RuntimeLibcallImpl record paired with its numeric enum value and the
+/// RuntimeLibcall it provides.
+class RuntimeLibcallImpl {
+  const Record *TheDef;
+  /// Null when the "Provides" record is not present in the map passed to the
+  /// constructor.
+  const RuntimeLibcall *Provides = nullptr;
+  const size_t EnumVal;
+
+public:
+  RuntimeLibcallImpl(
+      const Record *Def,
+      const DenseMap<const Record *, const RuntimeLibcall *> &ProvideMap,
+      size_t EnumVal)
+      : TheDef(Def), EnumVal(EnumVal) {
+    if (const Record *ProvidesDef = Def->getValueAsDef("Provides"))
+      Provides = ProvideMap.lookup(ProvidesDef);
+  }
+
+  ~RuntimeLibcallImpl() = default;
+
+  const Record *getDef() const { return TheDef; }
+
+  StringRef getName() const { return TheDef->getName(); }
+
+  size_t getEnumVal() const { return EnumVal; }
+
+  const RuntimeLibcall *getProvides() const { return Provides; }
+
+  StringRef getLibcallFuncName() const {
+    return TheDef->getValueAsString("LibCallFuncName");
+  }
+
+  const Record *getCallingConv() const {
+    return TheDef->getValueAsOptionalDef("CallingConv");
+  }
+
+  void emitQuotedLibcallFuncName(raw_ostream &OS) const {
+    OS << '\"' << getLibcallFuncName() << '\"';
+  }
+
+  bool isDefault() const { return TheDef->getValueAsBit("IsDefault"); }
+
+  /// Emit the qualified enumerator, `RTLIB::impl_<record name>`.
+  void emitEnumEntry(raw_ostream &OS) const {
+    OS << "RTLIB::impl_" << this->getName();
+  }
+
+  /// Emit `setLibcallImpl(<libcall>, <impl>); // <function name>`.
+  void emitSetImplCall(raw_ostream &OS) const {
+    assert(Provides && "implementation does not provide a libcall");
+    OS << "setLibcallImpl(";
+    Provides->emitEnumEntry(OS);
+    OS << ", ";
+    emitEnumEntry(OS);
+    OS << "); // " << getLibcallFuncName() << '\n';
+  }
+
+  /// Emit `{<libcall>, <impl>}, // <function name>`.
+  void emitTableEntry(raw_ostream &OS) const {
+    assert(Provides && "implementation does not provide a libcall");
+    OS << '{';
+    Provides->emitEnumEntry(OS);
+    OS << ", ";
+    emitEnumEntry(OS);
+    OS << "}, // " << getLibcallFuncName() << '\n';
+  }
+
+  void emitSetCallingConv(raw_ostream &OS) const {}
+};
+
+/// A group of libcall implementations together with an optional calling
+/// convention record.
+struct LibcallsWithCC {
+  std::vector<const RuntimeLibcallImpl *> LibcallImpls;
+  const Record *CallingConv = nullptr;
+};
+
+/// Owns the RuntimeLibcall and RuntimeLibcallImpl wrappers built from a
+/// RecordKeeper and provides record -> wrapper lookup.
+class RuntimeLibcalls {
+private:
+  DenseMap<const Record *, const RuntimeLibcall *> Def2RuntimeLibcall;
+  DenseMap<const Record *, const RuntimeLibcallImpl *> Def2RuntimeLibcallImpl;
+
+  std::vector<RuntimeLibcall> RuntimeLibcallDefList;
+  std::vector<RuntimeLibcallImpl> RuntimeLibcallImplDefList;
+
+  DenseMap<const RuntimeLibcall *, const RuntimeLibcallImpl *>
+      LibCallToDefaultImpl;
+
+public:
+  RuntimeLibcalls(const RecordKeeper &Records);
+
+  ArrayRef<RuntimeLibcall> getRuntimeLibcallDefList() const {
+    return RuntimeLibcallDefList;
+  }
+
+  ArrayRef<RuntimeLibcallImpl> getRuntimeLibcallImplDefList() const {
+    return RuntimeLibcallImplDefList;
+  }
+
+  /// Returns null if \p Def has no associated RuntimeLibcall.
+  const RuntimeLibcall *getRuntimeLibcall(const Record *Def) const {
+    return Def2RuntimeLibcall.lookup(Def);
+  }
+
+  /// Returns null if \p Def has no associated RuntimeLibcallImpl.
+  const RuntimeLibcallImpl *getRuntimeLibcallImpl(const Record *Def) const {
+    return Def2RuntimeLibcallImpl.lookup(Def);
+  }
+};
+
+} // namespace llvm
+
+#endif // LLVM_UTILS_TABLEGEN_BASIC_RUNTIMELIBCALLS_H
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 6a36f471678bf..7aca87a63d0a2 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -8,6 +8,8 @@
 
 #define DEBUG_TYPE "runtime-libcall-emitter"
 
+#include "RuntimeLibcalls.h"
+
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
@@ -65,160 +67,12 @@ template <> struct DenseMapInfo<PredicateWithCC, void> {
     return LHS == RHS;
   }
 };
-} // namespace llvm
-
-namespace {
-
-class AvailabilityPredicate {
-  const Record *TheDef;
-  StringRef PredicateString;
-
-public:
-  AvailabilityPredicate(const Record *Def) : TheDef(Def) {
-    if (TheDef)
-      PredicateString = TheDef->getValueAsString("Cond");
-  }
-
-  const Record *getDef() const { return TheDef; }
-
-  bool isAlwaysAvailable() const { return PredicateString.empty(); }
-
-  void emitIf(raw_ostream &OS) const {
-    OS << "if (" << PredicateString << ") {\n";
-  }
-
-  void emitEndIf(raw_ostream &OS) const { OS << "}\n"; }
-
-  void emitTableVariableNameSuffix(raw_ostream &OS) const {
-    if (TheDef)
-      OS << '_' << TheDef->getName();
-  }
-};
-
-class RuntimeLibcallEmitter;
-class RuntimeLibcallImpl;
-
-/// Used to apply predicates to nested sets of libcalls.
-struct LibcallPredicateExpander : SetTheory::Expander {
-  const RuntimeLibcallEmitter &LibcallEmitter;
-  DenseMap<const RuntimeLibcallImpl *,
-           std::pair<std::vector<const Record *>, const Record *>> &Func2Preds;
-
-  LibcallPredicateExpander(
-      const RuntimeLibcallEmitter &LibcallEmitter,
-      DenseMap<const RuntimeLibcallImpl *,
-               std::pair<std::vector<const Record *>, const Record *>>
-          &Func2Preds)
-      : LibcallEmitter(LibcallEmitter), Func2Preds(Func2Preds) {}
-
-  void expand(SetTheory &ST, const Record *Def,
-              SetTheory::RecSet &Elts) override;
-};
-
-class RuntimeLibcall {
-  const Record *TheDef = nullptr;
-  const size_t EnumVal;
-
-public:
-  RuntimeLibcall() = delete;
-  RuntimeLibcall(const Record *Def, size_t EnumVal)
-      : TheDef(Def), EnumVal(EnumVal) {
-    assert(Def);
-  }
-
-  ~RuntimeLibcall() { assert(TheDef); }
-
-  const Record *getDef() const { return TheDef; }
-
-  StringRef getName() const { return TheDef->getName(); }
-
-  size_t getEnumVal() const { return EnumVal; }
-
-  void emitEnumEntry(raw_ostream &OS) const {
-    OS << "RTLIB::" << TheDef->getValueAsString("Name");
-  }
-};
-
-class RuntimeLibcallImpl {
-  const Record *TheDef;
-  const RuntimeLibcall *Provides = nullptr;
-  const size_t EnumVal;
-
-public:
-  RuntimeLibcallImpl(
-      const Record *Def,
-      const DenseMap<const Record *, const RuntimeLibcall *> &ProvideMap,
-      size_t EnumVal)
-      : TheDef(Def), EnumVal(EnumVal) {
-    if (const Record *ProvidesDef = Def->getValueAsDef("Provides"))
-      Provides = ProvideMap.lookup(ProvidesDef);
-  }
-
-  ~RuntimeLibcallImpl() = default;
-
-  const Record *getDef() const { return TheDef; }
-
-  StringRef getName() const { return TheDef->getName(); }
-
-  size_t getEnumVal() const { return EnumVal; }
-
-  const RuntimeLibcall *getProvides() const { return Provides; }
-
-  StringRef getLibcallFuncName() const {
-    return TheDef->getValueAsString("LibCallFuncName");
-  }
-
-  const Record *getCallingConv() const {
-    return TheDef->getValueAsOptionalDef("CallingConv");
-  }
-
-  void emitQuotedLibcallFuncName(raw_ostream &OS) const {
-    OS << '\"' << getLibcallFuncName() << '\"';
-  }
-
-  bool isDefault() const { return TheDef->getValueAsBit("IsDefault"); }
-
-  void emitEnumEntry(raw_ostream &OS) const {
-    OS << "RTLIB::impl_" << this->getName();
-  }
-
-  void emitSetImplCall(raw_ostream &OS) const {
-    OS << "setLibcallImpl(";
-    Provides->emitEnumEntry(OS);
-    OS << ", ";
-    emitEnumEntry(OS);
-    OS << "); // " << getLibcallFuncName() << '\n';
-  }
-
-  void emitTableEntry(raw_ostream &OS) const {
-    OS << '{';
-    Provides->emitEnumEntry(OS);
-    OS << ", ";
-    emitEnumEntry(OS);
-    OS << "}, // " << getLibcallFuncName() << '\n';
-  }
-
-  void emitSetCallingConv(raw_ostream &OS) const {}
-};
-
-struct LibcallsWithCC {
-  std::vector<const RuntimeLibcallImpl *> LibcallImpls;
-  const Record *CallingConv = nullptr;
-};
 
 class RuntimeLibcallEmitter {
 private:
   const RecordKeeper &Records;
-  DenseMap<const Record *, const RuntimeLibcall *> Def2RuntimeLibcall;
-  DenseMap<const Record *, const RuntimeLibcallImpl *> Def2RuntimeLibcallImpl;
+  RuntimeLibcalls Libcalls;
 
-  std::vector<RuntimeLibcall> RuntimeLibcallDefList;
-  std::vector<RuntimeLibcallImpl> RuntimeLibcallImplDefList;
-
-  DenseMap<const RuntimeLibcall *, const RuntimeLibcallImpl *>
-      LibCallToDefaultImpl;
-
-private:
   void emitGetRuntimeLibcallEnum(raw_ostream &OS) const;
 
   void emitNameMatchHashTable(raw_ostream &OS,
@@ -229,61 +83,7 @@ class RuntimeLibcallEmitter {
   void emitSystemRuntimeLibrarySetCalls(raw_ostream &OS) const;
 
 public:
-  RuntimeLibcallEmitter(const RecordKeeper &R) : Records(R) {
-
-    ArrayRef<const Record *> AllRuntimeLibcalls =
-        Records.getAllDerivedDefinitions("RuntimeLibcall");
-
-    RuntimeLibcallDefList.reserve(AllRuntimeLibcalls.size());
-
-    size_t CallTypeEnumVal = 0;
-    for (const Record *RuntimeLibcallDef : AllRuntimeLibcalls) {
-      RuntimeLibcallDefList.emplace_back(RuntimeLibcallDef, CallTypeEnumVal++);
-      Def2RuntimeLibcall[RuntimeLibcallDef] = &RuntimeLibcallDefList.back();
-    }
-
-    for (RuntimeLibcall &LibCall : RuntimeLibcallDefList)
-      Def2RuntimeLibcall[LibCall.getDef()] = &LibCall;
-
-    ArrayRef<const Record *> AllRuntimeLibcallImplsRaw =
-        Records.getAllDerivedDefinitions("RuntimeLibcallImpl");
-
-    SmallVector<const Record *, 1024> AllRuntimeLibcallImpls(
-        AllRuntimeLibcallImplsRaw);
-
-    // Sort by libcall impl name and secondarily by the enum name.
-    sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) {
-      return std::pair(A->getValueAsString("LibCallFuncName"), A->getName()) <
-             std::pair(B->getValueAsString("LibCallFuncName"), B->getName());
-    });
-
-    RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size());
-
-    size_t LibCallImplEnumVal = 1;
-    for (const Record *LibCallImplDef : AllRuntimeLibcallImpls) {
-      RuntimeLibcallImplDefList.emplace_back(LibCallImplDef, Def2RuntimeLibcall,
-                                             LibCallImplEnumVal++);
-
-      const RuntimeLibcallImpl &LibCallImpl = RuntimeLibcallImplDefList.back();
-      Def2RuntimeLibcallImpl[LibCallImplDef] = &LibCallImpl;
-
-      if (LibCallImpl.isDefault()) {
-        const RuntimeLibcall *Provides = LibCallImpl.getProvides();
-        if (!Provides)
-          PrintFatalError(LibCallImplDef->getLoc(),
-                          "default implementations must provide a libcall");
-        LibCallToDefaultImpl[Provides] = &LibCallImpl;
-      }
-    }
-  }
-
-  const RuntimeLibcall *getRuntimeLibcall(const Record *Def) const {
-    return Def2RuntimeLibcall.lookup(Def);
-  }
-
-  const RuntimeLibcallImpl *getRuntimeLibcallImpl(const Record *Def) const {
-    return Def2RuntimeLibcallImpl.lookup(Def);
-  }
+  RuntimeLibcallEmitter(const RecordKeeper &R) : Records(R), Libcalls(R) {}
 
   void run(raw_ostream &OS);
 };
@@ -297,24 +97,25 @@ void RuntimeLibcallEmitter::emitGetRuntimeLibcallEnum(raw_ostream &OS) const {
         "namespace RTLIB {\n"
         "enum Libcall : unsigned short {\n";
 
-  for (const RuntimeLibcall &LibCall : RuntimeLibcallDefList) {
+  for (const RuntimeLibcall &LibCall : Libcalls.getRuntimeLibcallDefList()) {
     StringRef Name = LibCall.getName();
     OS << "  " << Name << " = " << LibCall.getEnumVal() << ",\n";
   }
 
-  OS << "  UNKNOWN_LIBCALL = " << RuntimeLibcallDefList.size()
+  OS << "  UNKNOWN_LIBCALL = " << Libcalls.getRuntimeLibcallDefList().size()
      << "\n};\n\n"
         "enum LibcallImpl : unsigned short {\n"
         "  Unsupported = 0,\n";
 
-  for (const RuntimeLibcallImpl &LibCall : RuntimeLibcallImplDefList) {
+  for (const RuntimeLibcallImpl &LibCall :
+       Libcalls.getRuntimeLibcallImplDefList()) {
     OS << "  impl_" << LibCall.getName() << " = " << LibCall.getEnumVal()
        << ", // " << LibCall.getLibcallFuncName() << '\n';
   }
 
   OS << "};\n"
      << "constexpr size_t NumLibcallImpls = "
-     << RuntimeLibcallImplDefList.size() + 1
+     << Libcalls.getRuntimeLibcallImplDefList().size() + 1
      << ";\n"
         "} // End namespace RTLIB\n"
         "} // End namespace llvm\n";
@@ -394,6 +195,8 @@ constructPerfectHashTable(ArrayRef<RuntimeLibcallImpl> Keywords,
 /// Generate hash table based lookup by name.
 void RuntimeLibcallEmitter::emitNameMatchHashTable(
     raw_ostream &OS, StringToOffsetTable &OffsetTable) const {
+  ArrayRef<RuntimeLibcallImpl> RuntimeLibcallImplDefList =
+      Libcalls.getRuntimeLibcallImplDefList();
   std::vector<uint64_t> Hashes(RuntimeLibcallImplDefList.size());
   std::vector<unsigned> TableValues(RuntimeLibcallImplDefList.size());
   DenseSet<StringRef> SeenFuncNames;
@@ -495,7 +298,8 @@ void RuntimeLibcallEmitter::emitGetInitRuntimeLibcallNames(
   {
     IfDefEmitter IfDef(OS, "GET_INIT_RUNTIME_LIBCALL_NAMES");
 
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList)
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList())
       Table.GetOrAddStringOffset(LibCallImpl.getLibcallFuncName());
 
     Table.EmitStringTableDef(OS, "RuntimeLibcallImplNameTable");
@@ -505,7 +309,8 @@ const uint16_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameOffsetTable[] = {
 
     OS << formatv("  {}, // {}\n", Table.GetStringOffset(""),
                   ""); // Unsupported entry
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList()) {
       StringRef ImplName = LibCallImpl.getLibcallFuncName();
       OS << formatv("  {}, // {}\n", Table.GetStringOffset(ImplName), ImplName);
     }
@@ -516,7 +321,8 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
 )";
 
     OS << "  0,\n";
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList)
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList())
       OS << "  " << LibCallImpl.getLibcallFuncName().size() << ",\n";
     OS << "};\n\n";
 
@@ -525,7 +331,8 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
           "ImplToLibcall[RTLIB::NumLibcallImpls] = {\n"
           "  RTLIB::UNKNOWN_LIBCALL, // RTLIB::Unsupported\n";
 
-    for (const RuntimeLibcallImpl &LibCallImpl : RuntimeLibcallImplDefList) {
+    for (const RuntimeLibcallImpl &LibCallImpl :
+         Libcalls.getRuntimeLibcallImplDefList()) {
       const RuntimeLibcall *Provides = LibCallImpl.getProvides();
       OS << "  ";
       Provides->emitEnumEntry(OS);
@@ -533,6 +340,7 @@ const uint8_t RTLIB::RuntimeLibcallsInfo::RuntimeLibcallNameSizeTable[] = {
       LibCallImpl.emitEnumEntry(OS);
       OS << '\n';
     }
+
     OS << "};\n\n";
   }
 
@@ -544,11 +352,8 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
   OS << "void llvm::RTLIB::RuntimeLibcallsInfo::setTargetRuntimeLibcallSets("
         "const llvm::Triple &TT, ExceptionHandling ExceptionModel, "
         "FloatABI::ABIType FloatABI, EABI EABIVersion, "
-        "StringRef ABIName) {\n"
-        "  struct LibcallImplPair {\n"
-        "    RTLIB::Libcall Func;\n"
-        "    RTLIB::LibcallImpl Impl;\n"
-        "  };\n";
+        "StringRef ABIName) {\n";
+
   ArrayRef<const Record *> AllLibs =
       Records.getAllDerivedDefinitions("SystemRuntimeLibrary");
 
@@ -579,7 +384,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
              std::pair<std::vector<const Record *>, const Record *>>
         Func2Preds;
     Sets.addExpander("LibcallImpls", std::make_unique<LibcallPredicateExpander>(
-                                         *this, Func2Preds));
+                                         Libcalls, Func2Preds));
 
     const SetTheory::RecVec *Elements =
         Sets.expand(R->getValueAsDef("MemberList"));
@@ -592,11 +397,12 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
     constexpr unsigned BitsPerStorageElt = 64;
     DenseMap<PredicateWithCC, LibcallsWithCC> Pred2Funcs;
 
-    SmallVector<uint64_t, 32> BitsetValues(
-        divideCeil(RuntimeLibcallImplDefList.size() + 1, BitsPerStorageElt));
+    SmallVector<uint64_t, 32> BitsetValues(divideCeil(
+        Libcalls.getRuntimeLibcallImplDefList().size() + 1, BitsPerStorageElt));
 
     for (const Record *Elt : *Elements) {
-      const RuntimeLibcallImpl *LibCallImpl = getRuntimeLibcallImpl(Elt);
+      const RuntimeLibcallImpl *LibCallImpl =
+          Libcalls.getRuntimeLibcallImpl(Elt);
       if (!LibCallImpl) {
         PrintError(R, "entry for SystemLibrary is not a RuntimeLibcallImpl");
         PrintNote(Elt->getLoc(), "invalid entry `" + Elt->getName() + "`");
@@ -703,7 +509,7 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
       Funcs.erase(UniqueI, Funcs.end());
 
       OS << indent(IndentDepth + 2)
-         << "static const LibcallImplPair LibraryCalls";
+         << "static const RTLIB::LibcallImpl LibraryCalls";
       SubsetPredicate.emitTableVariableNameSuffix(OS);
       if (FuncsWithCC.CallingConv)
         OS << '_' << FuncsWithCC.CallingConv->getName();
@@ -711,18 +517,18 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
       OS << "[] = {\n";
       for (const RuntimeLibcallImpl *LibCallImpl : Funcs) {
         OS << indent(IndentDepth + 6);
-        LibCallImpl->emitTableEntry(OS);
+        LibCallImpl->emitEnumEntry(OS);
+        OS << ", // " << LibCallImpl->getLibcallFuncName() << '\n';
       }
 
       OS << indent(IndentDepth + 2) << "};\n\n"
          << indent(IndentDepth + 2)
-         << "for (const auto [Func, Impl] : LibraryCalls";
+         << "for (const RTLIB::LibcallImpl Impl : LibraryCalls";
       SubsetPredicate.emitTableVariableNameSuffix(OS);
       if (FuncsWithCC.CallingConv)
         OS << '_' << FuncsWithCC.CallingConv->getName();
 
-      OS << ") {\n"
-         << indent(IndentDepth + 4) << "setLibcallImpl(Func, Impl);\n";
+      OS << ") {\n" << indent(IndentDepth + 4) << "setAvailable(Impl);\n";
 
       if (FuncsWithCC.CallingConv) {
         StringRef CCEnum =
@@ -759,44 +565,10 @@ void RuntimeLibcallEmitter::run(raw_ostream &OS) {
   emitGetInitRuntimeLibcallNames(OS);
 
   {
-    IfDefEmitter IfDef(OS, "GET_SET_TARGET_RUNTIME_LIBCALL_SETS");
+    IfDefEmitter IfDef(OS, "GET_RUNTIME_LIBCALLS_INFO");
     emitSystemRuntimeLibrarySetCalls(OS);
   }
 }
 
-void LibcallPredicateExpander::expand(SetTheory &ST, const Record *Def,
-                                      SetTheory::RecSet &Elts) {
-  assert(Def->isSubClassOf("LibcallImpls"));
-
-  SetTheory::RecSet TmpElts;
-
-  ST.evaluate(Def->getValueInit("MemberList"), TmpElts, Def->getLoc());
-
-  Elts.insert(TmpElts.begin(), TmpElts.end());
-
-  AvailabilityPredicate AP(Def->getValueAsDef("AvailabilityPredicate"));
-  const Record *CCClass = Def->getValueAsOptionalDef("CallingConv");
-
-  // This is assuming we aren't conditionally applying a calling convention to
-  // some subsets, and not another, but this doesn't appear to be used.
-
-  for (const Record *LibcallImplDef : TmpElts) {
-    const RuntimeLibcallImpl *LibcallImpl =
-        LibcallEmitter.getRuntimeLibcallImpl(LibcallImplDef);
-    if (!AP.isAlwaysAvailable() || CCClass) {
-      auto [It, Inserted] = Func2Preds.insert({LibcallImpl, {{}, CCClass}});
-      if (!Inserted) {
-        PrintError(
-            Def,
-            "combining nested libcall set predicates currently unhandled: '" +
-                LibcallImpl->getLibcallFuncName() + "'");
-      }
-
-      It->second.first.push_back(AP.getDef());
-      It->second.second = CCClass;
-    }
-  }
-}
-
 static TableGen::Emitter::OptClass<RuntimeLibcallEmitter>
     X("gen-runtime-libcalls", "Generate RuntimeLibcalls");
diff --git a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
index 64f03dae83e7d..dd18d29e6c676 100644
--- a/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
+++ b/llvm/utils/TableGen/SDNodeInfoEmitter.cpp
@@ -10,6 +10,7 @@
 #include "Common/CodeGenDAGPatterns.h" // For SDNodeInfo.
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/TableGen/CodeGenHelpers.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
@@ -129,9 +130,8 @@ SDNodeInfoEmitter::SDNodeInfoEmitter(const RecordKeeper &RK)
 }
 
 void SDNodeInfoEmitter::emitEnum(raw_ostream &OS) const {
-  OS << "#ifdef GET_SDNODE_ENUM\n";
-  OS << "#undef GET_SDNODE_ENUM\n\n";
-  OS << "namespace llvm::" << TargetSDNodeNamespace << " {\n\n";
+  IfDefEmitter IfDef(OS, "GET_SDNODE_ENUM");
+  NamespaceEmitter NS(OS, "llvm::" + TargetSDNodeNamespace);
 
   if (!NodesByName.empty()) {
     StringRef FirstName = NodesByName.begin()->first;
@@ -145,14 +145,11 @@ void SDNodeInfoEmitter::emitEnum(raw_ostream &OS) const {
 
     OS << "};\n\n";
     OS << "static constexpr unsigned GENERATED_OPCODE_END = " << LastName
-       << " + 1;\n\n";
+       << " + 1;\n";
   } else {
     OS << "static constexpr unsigned GENERATED_OPCODE_END = "
-          "ISD::BUILTIN_OP_END;\n\n";
+          "ISD::BUILTIN_OP_END;\n";
   }
-
-  OS << "} // namespace llvm::" << TargetSDNodeNamespace << "\n\n";
-  OS << "#endif // GET_SDNODE_ENUM\n\n";
 }
 
 std::vector<unsigned> SDNodeInfoEmitter::emitNodeNames(raw_ostream &OS) const {
@@ -324,9 +321,8 @@ static void emitDesc(raw_ostream &OS, StringRef EnumName,
 void SDNodeInfoEmitter::emitDescs(raw_ostream &OS) const {
   StringRef TargetName = Target.getName();
 
-  OS << "#ifdef GET_SDNODE_DESC\n";
-  OS << "#undef GET_SDNODE_DESC\n\n";
-  OS << "namespace llvm {\n";
+  IfDefEmitter IfDef(OS, "GET_SDNODE_DESC");
+  NamespaceEmitter NS(OS, "llvm");
 
   std::vector<unsigned> NameOffsets = emitNodeNames(OS);
   std::vector<std::pair<unsigned, unsigned>> ConstraintOffsetsAndCounts =
@@ -343,11 +339,8 @@ void SDNodeInfoEmitter::emitDescs(raw_ostream &OS) const {
 
   OS << formatv("static const SDNodeInfo {0}GenSDNodeInfo(\n"
                 "    /*NumOpcodes=*/{1}, {0}SDNodeDescs,\n"
-                "    {0}SDNodeNames, {0}SDTypeConstraints);\n\n",
+                "    {0}SDNodeNames, {0}SDTypeConstraints);\n",
                 TargetName, NodesByName.size());
-
-  OS << "} // namespace llvm\n\n";
-  OS << "#endif // GET_SDNODE_DESC\n\n";
 }
 
 void SDNodeInfoEmitter::run(raw_ostream &OS) const {
diff --git a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
index 393309ee39bfe..a261f2866be47 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn
@@ -12,7 +12,7 @@ static_library("Passes") {
     "//llvm/utils/gn/build/libs/pthread",
   ]
   sources = [
-    "ADRRelaxationPass.cpp",
+    "AArch64RelaxationPass.cpp",
     "Aligner.cpp",
     "AllocCombiner.cpp",
     "AsmDump.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
index 2f84999621e1b..3c3fdf7e16885 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn
@@ -82,6 +82,7 @@ static_library("bugprone") {
     "SmartPtrArrayMismatchCheck.cpp",
     "SpuriouslyWakeUpFunctionsCheck.cpp",
     "StandaloneEmptyCheck.cpp",
+    "StdNamespaceModificationCheck.cpp",
     "StringConstructorCheck.cpp",
     "StringIntegerAssignmentCheck.cpp",
     "StringLiteralWithEmbeddedNulCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
index ec642b6afad66..1eae289143b5b 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn
@@ -16,7 +16,6 @@ static_library("cert") {
   ]
   sources = [
     "CERTTidyModule.cpp",
-    "DontModifyStdNamespaceCheck.cpp",
     "FloatLoopCounter.cpp",
     "LimitedRandomnessCheck.cpp",
     "MutatingCopyCheck.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
index 065fc6cdd74a3..bd8d9610c2a4a 100644
--- a/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/CodeGen/BUILD.gn
@@ -17,6 +17,7 @@ unittest("ClangCodeGenTests") {
     "BufferSourceTest.cpp",
     "CheckTargetFeaturesTest.cpp",
     "CodeGenExternalTest.cpp",
+    "DemangleTrapReasonInDebugInfo.cpp",
     "TBAAMetadataTest.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
index 9848efef70568..fa99fa8649caf 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/LanguageRuntime/CPlusPlus/BUILD.gn
@@ -1,10 +1,16 @@
 static_library("CPlusPlus") {
   output_name = "lldbPluginCPPRuntime"
-  configs += [ "//llvm/utils/gn/build:lldb_code" ]
+  configs += [
+    "//llvm/utils/gn/build:clang_code",
+    "//llvm/utils/gn/build:lldb_code",
+  ]
   deps = [
     "//lldb/source/Core",
     "//lldb/source/Symbol",
     "//lldb/source/Target",
   ]
-  sources = [ "CPPLanguageRuntime.cpp" ]
+  sources = [
+    "CPPLanguageRuntime.cpp",
+    "VerboseTrapFrameRecognizer.cpp",
+  ]
 }
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
index a863baf912051..783eb96283596 100644
--- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -105,6 +105,5 @@ static_library("Target") {
     "UnixSignals.cpp",
     "UnwindAssembly.cpp",
     "UnwindLLDB.cpp",
-    "VerboseTrapFrameRecognizer.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
index 1a890f6733597..a234d2be67f66 100644
--- a/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/BinaryFormat/BUILD.gn
@@ -12,7 +12,6 @@ static_library("BinaryFormat") {
     "ELF.cpp",
     "MachO.cpp",
     "Magic.cpp",
-    "Minidump.cpp",
     "MsgPackDocument.cpp",
     "MsgPackDocumentYAML.cpp",
     "MsgPackReader.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 444670212cafb..eb41df208941a 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -88,6 +88,7 @@ static_library("CodeGen") {
     "LatencyPriorityQueue.cpp",
     "LazyMachineBlockFrequencyInfo.cpp",
     "LexicalScopes.cpp",
+    "LibcallLoweringInfo.cpp",
     "LiveDebugValues/InstrRefBasedImpl.cpp",
     "LiveDebugValues/LiveDebugValues.cpp",
     "LiveDebugValues/VarLocBasedImpl.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
index a1f5b475e2096..ad72c0069237d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn
@@ -151,6 +151,7 @@ static_library("LLVMRISCVCodeGen") {
     "RISCVMoveMerger.cpp",
     "RISCVOptWInstrs.cpp",
     "RISCVPostRAExpandPseudoInsts.cpp",
+    "RISCVPromoteConstant.cpp",
     "RISCVPushPopOptimizer.cpp",
     "RISCVRedundantCopyElimination.cpp",
     "RISCVRegisterInfo.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
index 43916cef756ff..918132b38b6ed 100644
--- a/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/Basic/BUILD.gn
@@ -10,6 +10,7 @@ source_set("Basic") {
     "DirectiveEmitter.cpp",
     "IntrinsicEmitter.cpp",
     "RISCVTargetDefEmitter.cpp",
+    "RuntimeLibcalls.cpp",
     "RuntimeLibcallsEmitter.cpp",
     "SDNodeProperties.cpp",
     "TableGen.cpp",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 10f0cc254ea97..80bc0e5986e51 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -949,7 +949,7 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">,
   }];
 
   string llvmBuilder = [{
-    auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs(
+    auto [id, args] = NVVM::MBarrierTestWaitOp::getIntrinsicIDAndArgs(
                       *op, moduleTranslation, builder);
     $res = createIntrinsicCall(builder, id, args);
   }];
diff --git a/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir b/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir
new file mode 100644
index 0000000000000..9bb3b082777fd
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/mbarriers.mlir
@@ -0,0 +1,116 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
+  // CHECK-LABEL: define void @cp_async_mbarrier_arrive(ptr addrspace(3) %0, ptr %1) {
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %1)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %1)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
+  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
+  nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3>
+  nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3>
+  llvm.return
+}
+
+llvm.func @mbarrier_init_generic(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_init_generic(ptr %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.init(ptr %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  nvvm.mbarrier.init %barrier, %count : !llvm.ptr, i32
+  llvm.return
+}
+
+llvm.func @mbarrier_init_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_init_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.init.shared(ptr addrspace(3) %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  nvvm.mbarrier.init %barrier, %count : !llvm.ptr<3>, i32
+  llvm.return
+}
+
+llvm.func @mbarrier_inval_generic(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_inval_generic(ptr %0) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.inval(ptr %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.inval %barrier : !llvm.ptr
+  llvm.return
+}
+
+llvm.func @mbarrier_inval_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_inval_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: call void @llvm.nvvm.mbarrier.inval.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  nvvm.mbarrier.inval %barrier : !llvm.ptr<3>
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_arrive(ptr %0) {
+  // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive(ptr %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr  -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_arrive_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i64 @llvm.nvvm.mbarrier.arrive.shared(ptr addrspace(3) %0)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) {
+  // CHECK-LABEL: define void @mbarrier_arrive_nocomplete(ptr %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete(ptr %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr, i32 -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) {
+  // CHECK-LABEL: define void @mbarrier_arrive_nocomplete_shared(ptr addrspace(3) %0) {
+  // CHECK-NEXT: %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %3 = call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(ptr addrspace(3) %0, i32 %2)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32  -> i64
+  llvm.return
+}
+
+llvm.func @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 {
+  // CHECK-LABEL: define i1 @mbarrier_test_wait(ptr %0, i64 %1) {
+  // CHECK-NEXT: %3 = call i1 @llvm.nvvm.mbarrier.test.wait(ptr %0, i64 %1)
+  // CHECK-NEXT: ret i1 %3
+  // CHECK-NEXT: }
+  %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr, i64 -> i1
+  llvm.return %isComplete : i1
+}
+
+llvm.func @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) {
+  // CHECK-LABEL: define void @mbarrier_test_wait_shared(ptr addrspace(3) %0, i64 %1) {
+  // CHECK-NEXT: %3 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-NEXT: %4 = call i1 @llvm.nvvm.mbarrier.test.wait.shared(ptr addrspace(3) %0, i64 %1)
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  %count = nvvm.read.ptx.sreg.ntid.x : i32
+  %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 3fc09f371a347..1ec55408e97a5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -531,19 +531,6 @@ llvm.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32)
   llvm.return
 }
 
-// CHECK-LABEL: @cp_async_mbarrier_arrive
-llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.ptr) {
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3>
-  // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}})
-  nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3>
-  llvm.return
-}
-
 // CHECK-LABEL: @llvm_nvvm_setmaxregister
 llvm.func @llvm_nvvm_setmaxregister() {
   // CHECK: call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 256)
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index cb08397c201f2..f01d3decd9a1c 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -932,13 +932,14 @@ static void runIslScheduleOptimizer(
     POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
     return;
   } else if (isl_ctx_last_error(Ctx) != isl_error_none) {
-    const char *File = isl_ctx_last_error_file(Ctx);
-    int Line = isl_ctx_last_error_line(Ctx);
-    const char *Msg = isl_ctx_last_error_msg(Ctx);
-    POLLY_DEBUG(
-        dbgs()
-        << "ISL reported an error during the computation of a new schedule at "
-        << File << ":" << Line << ": " << Msg);
+    POLLY_DEBUG({
+      const char *File = isl_ctx_last_error_file(Ctx);
+      int Line = isl_ctx_last_error_line(Ctx);
+      const char *Msg = isl_ctx_last_error_msg(Ctx);
+      dbgs() << "ISL reported an error during the computation of a new "
+                "schedule at "
+             << File << ":" << Line << ": " << Msg;
+    });
     isl_ctx_reset_error(Ctx);
     return;
   } else if (Schedule.is_null()) {
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 8d225d63cdf3e..b65fe64acdea0 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1805,6 +1805,7 @@ libc_support_library(
         ":__support_cpp_optional",
         ":__support_macros_attributes",
         ":__support_macros_config",
+        ":types_wchar_t",
         ":types_wint_t",
     ],
 )
@@ -1859,6 +1860,7 @@ libc_function(
     hdrs = ["src/ctype/isalnum.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1869,6 +1871,7 @@ libc_function(
     hdrs = ["src/ctype/isalpha.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1909,6 +1912,7 @@ libc_function(
     hdrs = ["src/ctype/isdigit.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1919,6 +1923,7 @@ libc_function(
     hdrs = ["src/ctype/isgraph.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1929,6 +1934,7 @@ libc_function(
     hdrs = ["src/ctype/islower.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1949,6 +1955,7 @@ libc_function(
     hdrs = ["src/ctype/ispunct.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1959,6 +1966,7 @@ libc_function(
     hdrs = ["src/ctype/isspace.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1969,6 +1977,7 @@ libc_function(
     hdrs = ["src/ctype/isupper.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1979,6 +1988,7 @@ libc_function(
     hdrs = ["src/ctype/isxdigit.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -1999,6 +2009,7 @@ libc_function(
     hdrs = ["src/ctype/tolower.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )
@@ -2009,6 +2020,7 @@ libc_function(
     hdrs = ["src/ctype/toupper.h"],
     deps = [
         ":__support_common",
+        ":__support_cpp_limits",
         ":__support_ctype_utils",
     ],
 )