diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
index a2d3d3ff1512d..a7b74944690b4 100644
--- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
@@ -73,7 +73,9 @@ CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name,
       RAIITypesList(utils::options::parseStringList(
           Options.get("RAIITypesList", "std::lock_guard;std::scoped_lock"))),
       AllowedAwaitablesList(utils::options::parseStringList(
-          Options.get("AllowedAwaitablesList", ""))) {}
+          Options.get("AllowedAwaitablesList", ""))),
+      AllowedCallees(
+          utils::options::parseStringList(Options.get("AllowedCallees", ""))) {}
 
 void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) {
   // A suspension happens with co_await or co_yield.
@@ -81,7 +83,9 @@ void CoroutineHostileRAIICheck::registerMatchers(MatchFinder *Finder) {
                                     hasAttr(attr::Kind::ScopedLockable)))))
                             .bind("scoped-lockable");
   auto OtherRAII = varDecl(typeWithNameIn(RAIITypesList)).bind("raii");
-  auto AllowedSuspend = awaitable(typeWithNameIn(AllowedAwaitablesList));
+  auto AllowedSuspend = awaitable(
+      anyOf(typeWithNameIn(AllowedAwaitablesList),
+            callExpr(callee(functionDecl(hasAnyName(AllowedCallees))))));
   Finder->addMatcher(
       expr(anyOf(coawaitExpr(unless(AllowedSuspend)), coyieldExpr()),
            forEachPrevStmt(
@@ -111,5 +115,7 @@ void CoroutineHostileRAIICheck::storeOptions(
                 utils::options::serializeStringList(RAIITypesList));
   Options.store(Opts, "SafeAwaitableList",
                 utils::options::serializeStringList(AllowedAwaitablesList));
+  Options.store(Opts, "SafeCallees",
+                utils::options::serializeStringList(AllowedCallees));
 }
 } // namespace clang::tidy::misc
diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h
index 768b62ef07f90..12ad1b1e0e220 100644
--- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h
+++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h
@@ -46,6 +46,9 @@ class CoroutineHostileRAIICheck : public ClangTidyCheck {
   // List of fully qualified awaitable types which are considered safe to
   // co_await.
   std::vector<StringRef> AllowedAwaitablesList;
+  // List of callees whose return values are considered safe to directly
+  // co_await.
+  std::vector<StringRef> AllowedCallees;
 };
 
 } // namespace clang::tidy::misc
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 8637a9ab6d9f6..f25c4cacdacb7 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -423,6 +423,11 @@ Changes in existing checks
   positives on return of non-const pointer and fix false positives on
   pointer-to-member operator.
 
+- Improved :doc:`misc-coroutine-hostile-raii
+  <clang-tidy/checks/misc/coroutine-hostile-raii>` check by adding the option
+  `AllowedCallees`, that allows exempting safely awaitable callees from the
+  check.
+
 - Improved :doc:`misc-header-include-cycle
   <clang-tidy/checks/misc/header-include-cycle>` check performance.
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
index 0b054e4e20bd6..be80d39e4abf9 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
@@ -81,3 +81,23 @@ Options
     Eg: `my::safe::awaitable;other::awaitable`
     Default is an empty string.
 
+.. option:: AllowedCallees
+
+    A semicolon-separated list of callee function names which can
+    be safely awaited while having hostile RAII objects in scope.
+    Example usage:
+
+    .. code-block:: c++
+
+      // Consider option AllowedCallees = "noop"
+      task noop() { co_return; }
+
+      task coro() {
+        // This persists across the co_await but is not flagged
+        // because the awaitable is considered safe to await on.
+        const std::lock_guard l(&mu_);
+        co_await noop();
+      }
+
+    Eg: `my::safe::await;other::await`
+    Default is an empty string.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
index c23c355dac1b2..ec6ddec56e1f2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
@@ -1,7 +1,8 @@
 // RUN: %check_clang_tidy -std=c++20 %s misc-coroutine-hostile-raii %t \
 // RUN:   -config="{CheckOptions: {\
 // RUN:             misc-coroutine-hostile-raii.RAIITypesList: 'my::Mutex; ::my::other::Mutex', \
-// RUN:             misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable' \
+// RUN:             misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable', \
+// RUN:             misc-coroutine-hostile-raii.AllowedCallees: 'safe::AwaitFunc; ::safe::Obj::AwaitMethod' \
 // RUN:             }}"
 
 namespace std {
@@ -145,12 +146,18 @@ namespace safe {
   void await_suspend(std::coroutine_handle<>) noexcept {}
   void await_resume() noexcept {}
 };
+  std::suspend_always AwaitFunc();
+  struct Obj {
+    std::suspend_always AwaitMethod();
+  };
 } // namespace safe
 ReturnObject RAIISafeSuspendTest() {
   absl::Mutex a;
   co_await safe::awaitable{};
   using other = safe::awaitable;
   co_await other{};
+  co_await safe::AwaitFunc();
+  co_await safe::Obj().AwaitMethod();
 } 
 
 // ================================================================================
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 3288f5b12c77e..6c7e3d055456a 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -465,6 +465,15 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return createCompare(ptr.getLoc(), cir::CmpOpKind::eq, ptr, nullPtr);
   }
 
+  mlir::Value createAddrSpaceCast(mlir::Location loc, mlir::Value src,
+                                  mlir::Type newTy) {
+    return createCast(loc, cir::CastKind::address_space, src, newTy);
+  }
+
+  mlir::Value createAddrSpaceCast(mlir::Value src, mlir::Type newTy) {
+    return createAddrSpaceCast(src.getLoc(), src, newTy);
+  }
+
   //===--------------------------------------------------------------------===//
   // Binary Operators
   //===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.h b/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
index 45f646f1c9dfa..939e774a6ea67 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.h
@@ -13,7 +13,9 @@
 #ifndef CLANG_CIR_DIALECT_IR_CIRTYPES_H
 #define CLANG_CIR_DIALECT_IR_CIRTYPES_H
 
+#include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Types.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "clang/Basic/AddressSpaces.h"
@@ -38,6 +40,15 @@ bool isValidFundamentalIntWidth(unsigned width);
 /// void, or abstract types.
 bool isSized(mlir::Type ty);
 
+//===----------------------------------------------------------------------===//
+// AddressSpace helpers
+//===----------------------------------------------------------------------===//
+cir::TargetAddressSpaceAttr toCIRTargetAddressSpace(mlir::MLIRContext &context,
+                                                    clang::LangAS langAS);
+
+bool isMatchingAddressSpace(cir::TargetAddressSpaceAttr cirAS,
+                            clang::LangAS as);
+
 } // namespace cir
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h
index e147d2ba6087e..51787d914e1ec 100644
--- a/clang/include/clang/Frontend/CompilerInvocation.h
+++ b/clang/include/clang/Frontend/CompilerInvocation.h
@@ -147,6 +147,13 @@ class CompilerInvocationBase {
   }
   /// @}
 
+  /// Visitation.
+  /// @{
+  /// Visits paths stored in the invocation. The callback may return true to
+  /// short-circuit the visitation, or return false to continue visiting.
+  void visitPaths(llvm::function_ref<bool(StringRef)> Callback) const;
+  /// @}
+
   /// Command line generation.
   /// @{
   using StringAllocator = llvm::function_ref<const char *(const Twine &)>;
@@ -181,6 +188,12 @@ class CompilerInvocationBase {
   /// This is a (less-efficient) wrapper over generateCC1CommandLine().
   std::vector<std::string> getCC1CommandLine() const;
 
+protected:
+  /// Visits paths stored in the invocation. This is generally unsafe to call
+  /// directly, and each sub-class need to ensure calling this doesn't violate
+  /// its invariants.
+  void visitPathsImpl(llvm::function_ref<bool(std::string &)> Predicate);
+
 private:
   /// Generate command line options from DiagnosticOptions.
   static void GenerateDiagnosticArgs(const DiagnosticOptions &Opts,
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index c919a53ae089e..ba7da56cb9fce 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -241,6 +241,8 @@ class FrontendInputFile {
   /// Whether we're dealing with a 'system' input (vs. a 'user' input).
   bool IsSystem = false;
 
+  friend class CompilerInvocationBase;
+
 public:
   FrontendInputFile() = default;
   FrontendInputFile(StringRef File, InputKind Kind, bool IsSystem = false)
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index c18f158525e85..0f868c1c7c5da 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -151,14 +151,14 @@ struct UnsafeQualTypeDenseMapInfo {
 };
 
 /// An ID number that refers to a macro in an AST file.
-using MacroID = uint32_t;
+using MacroID = uint64_t;
 
 /// A global ID number that refers to a macro in an AST file.
-using GlobalMacroID = uint32_t;
+using GlobalMacroID = uint64_t;
 
 /// A local to a module ID number that refers to a macro in an
 /// AST file.
-using LocalMacroID = uint32_t;
+using LocalMacroID = uint64_t;
 
 /// The number of predefined macro IDs.
 const unsigned int NUM_PREDEF_MACRO_IDS = 1;
@@ -179,7 +179,7 @@ using CXXCtorInitializersID = uint32_t;
 
 /// An ID number that refers to an entity in the detailed
 /// preprocessing record.
-using PreprocessedEntityID = uint32_t;
+using PreprocessedEntityID = uint64_t;
 
 /// An ID number that refers to a submodule in a module file.
 using SubmoduleID = uint32_t;
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index af856a8097ab1..4d6fa585ebd45 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -800,14 +800,6 @@ class ASTReader
   /// files.
   llvm::DenseSet<LoadedMacroInfo> LoadedUndefs;
 
-  using GlobalMacroMapType =
-      ContinuousRangeMap<serialization::MacroID, ModuleFile *, 4>;
-
-  /// Mapping from global macro IDs to the module in which the
-  /// macro resides along with the offset that should be added to the
-  /// global macro ID to produce a local ID.
-  GlobalMacroMapType GlobalMacroMap;
-
   /// A vector containing submodules that have already been loaded.
   ///
   /// This vector is indexed by the Submodule ID (-1). NULL submodule entries
@@ -1655,8 +1647,7 @@ class ASTReader
 
   /// Returns the first preprocessed entity ID that begins or ends after
   /// \arg Loc.
-  serialization::PreprocessedEntityID
-  findPreprocessedEntity(SourceLocation Loc, bool EndsAfter) const;
+  unsigned findPreprocessedEntity(SourceLocation Loc, bool EndsAfter) const;
 
   /// Find the next module that contains entities and return the ID
   /// of the first entry.
@@ -1664,9 +1655,8 @@ class ASTReader
   /// \param SLocMapI points at a chunk of a module that contains no
   /// preprocessed entities or the entities it contains are not the
   /// ones we are looking for.
-  serialization::PreprocessedEntityID
-    findNextPreprocessedEntity(
-                        GlobalSLocOffsetMapType::const_iterator SLocMapI) const;
+  unsigned findNextPreprocessedEntity(
+      GlobalSLocOffsetMapType::const_iterator SLocMapI) const;
 
   /// Returns (ModuleFile, Local index) pair for \p GlobalIndex of a
   /// preprocessed entity.
@@ -1748,6 +1738,14 @@ class ASTReader
   std::pair<ModuleFile *, unsigned>
   translateIdentifierIDToIndex(serialization::IdentifierID ID) const;
 
+  /// Translate an \param MacroID ID to the index of MacrosLoaded
+  /// array and the corresponding module file.
+  std::pair<ModuleFile *, unsigned>
+  translateMacroIDToIndex(serialization::MacroID ID) const;
+
+  unsigned translatePreprocessedEntityIDToIndex(
+      serialization::PreprocessedEntityID ID) const;
+
   /// Translate an \param TypeID ID to the index of TypesLoaded
   /// array and the corresponding module file.
   std::pair<ModuleFile *, unsigned>
@@ -2163,6 +2161,14 @@ class ASTReader
   LocalDeclID mapGlobalIDToModuleFileGlobalID(ModuleFile &M,
                                               GlobalDeclID GlobalID);
 
+  /// Reads a macro ID from the given position in a record in the
+  /// given module.
+  ///
+  /// \returns The declaration ID read from the record, adjusted to a global
+  /// Macro ID.
+  serialization::MacroID
+  ReadMacroID(ModuleFile &F, const RecordDataImpl &Record, unsigned &Idx);
+
   /// Reads a declaration ID from the given position in a record in the
   /// given module.
   ///
@@ -2388,7 +2394,8 @@ class ASTReader
 
   /// Retrieve the global macro ID corresponding to the given local
   /// ID within the given module file.
-  serialization::MacroID getGlobalMacroID(ModuleFile &M, unsigned LocalID);
+  serialization::MacroID getGlobalMacroID(ModuleFile &M,
+                                          serialization::MacroID LocalID);
 
   /// Read the source location entry with index ID.
   bool ReadSLocEntry(int ID) override;
@@ -2572,8 +2579,8 @@ class ASTReader
 
   /// Determine the global preprocessed entity ID that corresponds to
   /// the given local ID within the given module.
-  serialization::PreprocessedEntityID
-  getGlobalPreprocessedEntityID(ModuleFile &M, unsigned LocalID) const;
+  serialization::PreprocessedEntityID getGlobalPreprocessedEntityID(
+      ModuleFile &M, serialization::PreprocessedEntityID LocalID) const;
 
   /// Add a macro to deserialize its macro directive history.
   ///
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 28c3e55864057..c77c98dffc39f 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -782,6 +782,10 @@ class ASTWriter : public ASTDeserializationListener,
   void AddLookupOffsets(const LookupBlockOffsets &Offsets,
                         RecordDataImpl &Record);
 
+  /// Emit a reference to a macro.
+  void AddMacroRef(MacroInfo *MI, const IdentifierInfo *Name,
+                   RecordDataImpl &Record);
+
   /// Emit a reference to a declaration.
   void AddDeclRef(const Decl *D, RecordDataImpl &Record);
   // Emit a reference to a declaration if the declaration was emitted.
diff --git a/clang/include/clang/Serialization/ModuleFile.h b/clang/include/clang/Serialization/ModuleFile.h
index f20cb2f9f35ae..783e2ba7a1f94 100644
--- a/clang/include/clang/Serialization/ModuleFile.h
+++ b/clang/include/clang/Serialization/ModuleFile.h
@@ -353,9 +353,6 @@ class ModuleFile {
   /// Base macro ID for macros local to this module.
   serialization::MacroID BaseMacroID = 0;
 
-  /// Remapping table for macro IDs in this module.
-  ContinuousRangeMap<uint32_t, int, 2> MacroRemap;
-
   /// The offset of the start of the set of defined macros.
   uint64_t MacroStartOffset = 0;
 
@@ -372,9 +369,6 @@ class ModuleFile {
   /// this module.
   serialization::PreprocessedEntityID BasePreprocessedEntityID = 0;
 
-  /// Remapping table for preprocessed entity IDs in this module.
-  ContinuousRangeMap<uint32_t, int, 2> PreprocessedEntityRemap;
-
   const PPEntityOffset *PreprocessedEntityOffsets = nullptr;
   unsigned NumPreprocessedEntities = 0;
 
diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
index ea51a75324e06..0f2eaa94a5987 100644
--- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp
+++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp
@@ -34,6 +34,8 @@ Origin &OriginManager::addOrigin(OriginID ID, const clang::Expr &E) {
 
 // TODO: Mark this method as const once we remove the call to getOrCreate.
 OriginID OriginManager::get(const Expr &E) {
+  if (auto *ParenIgnored = E.IgnoreParens(); ParenIgnored != &E)
+    return get(*ParenIgnored);
   auto It = ExprToOriginID.find(&E);
   if (It != ExprToOriginID.end())
     return It->second;
diff --git a/clang/lib/CIR/CodeGen/Address.h b/clang/lib/CIR/CodeGen/Address.h
index a67cbad7033a3..c8ce530a7b0d3 100644
--- a/clang/lib/CIR/CodeGen/Address.h
+++ b/clang/lib/CIR/CodeGen/Address.h
@@ -16,9 +16,11 @@
 
 #include "mlir/IR/Value.h"
 #include "clang/AST/CharUnits.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include "llvm/Support/Casting.h"
 
 namespace clang::CIRGen {
 
@@ -114,6 +116,11 @@ class Address {
     return elementType;
   }
 
+  cir::TargetAddressSpaceAttr getAddressSpace() const {
+    auto ptrTy = mlir::dyn_cast<cir::PointerType>(getType());
+    return ptrTy.getAddrSpace();
+  }
+
   clang::CharUnits getAlignment() const { return alignment; }
 
   /// Get the operation which defines this address.
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 4e6a5ee7ee210..eec4d10bb49b8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "CIRGenCall.h"
-#include "CIRGenConstantEmitter.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
 #include "CIRGenValue.h"
@@ -22,6 +21,7 @@
 #include "clang/AST/Expr.h"
 #include "clang/AST/GlobalDecl.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -193,11 +193,16 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     // default (e.g. in C / C++ auto vars are in the generic address space). At
     // the AST level this is handled within CreateTempAlloca et al., but for the
     // builtin / dynamic alloca we have to handle it here.
-    assert(!cir::MissingFeatures::addressSpace());
+
+    if (!cir::isMatchingAddressSpace(
+            getCIRAllocaAddressSpace(),
+            e->getType()->getPointeeType().getAddressSpace())) {
+      cgm.errorNYI(e->getSourceRange(), "Non-default address space for alloca");
+    }
 
     // Bitcast the alloca to the expected type.
-    return RValue::get(
-        builder.createBitcast(allocaAddr, builder.getVoidPtrTy()));
+    return RValue::get(builder.createBitcast(
+        allocaAddr, builder.getVoidPtrTy(getCIRAllocaAddressSpace())));
   }
 
   case Builtin::BIcos:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 9bb76894c13f1..c55fcabef0b3f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -22,7 +22,11 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/Basic/AddressSpaces.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 #include <optional>
 
@@ -1205,7 +1209,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
   case CK_AtomicToNonAtomic:
   case CK_ToUnion:
   case CK_BaseToDerived:
-  case CK_AddressSpaceConversion:
   case CK_ObjCObjectLValueCast:
   case CK_VectorSplat:
   case CK_ConstructorConversion:
@@ -1219,7 +1222,27 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
 
     return {};
   }
+  case CK_AddressSpaceConversion: {
+    LValue lv = emitLValue(e->getSubExpr());
+    QualType destTy = getContext().getPointerType(e->getType());
+
+    clang::LangAS srcLangAS = e->getSubExpr()->getType().getAddressSpace();
+    cir::TargetAddressSpaceAttr srcAS;
+    if (clang::isTargetAddressSpace(srcLangAS))
+      srcAS = cir::toCIRTargetAddressSpace(getMLIRContext(), srcLangAS);
+    else
+      cgm.errorNYI(
+          e->getSourceRange(),
+          "emitCastLValue: address space conversion from unknown address "
+          "space");
 
+    mlir::Value v = getTargetHooks().performAddrSpaceCast(
+        *this, lv.getPointer(), srcAS, convertType(destTy));
+
+    return makeAddrLValue(Address(v, convertTypeForMem(e->getType()),
+                                  lv.getAddress().getAlignment()),
+                          e->getType(), lv.getBaseInfo());
+  }
   case CK_LValueBitCast: {
     // This must be a reinterpret_cast (or c-style equivalent).
     const auto *ce = cast<ExplicitCastExpr>(e);
@@ -2233,6 +2256,8 @@ Address CIRGenFunction::createTempAllocaWithoutCast(
 
 /// This creates a alloca and inserts it into the entry block. The alloca is
 /// casted to default address space if necessary.
+// TODO(cir): Implement address space casting to match classic codegen's
+// CreateTempAlloca behavior with DestLangAS parameter
 Address CIRGenFunction::createTempAlloca(mlir::Type ty, CharUnits align,
                                          mlir::Location loc, const Twine &name,
                                          mlir::Value arraySize,
@@ -2247,7 +2272,21 @@ Address CIRGenFunction::createTempAlloca(mlir::Type ty, CharUnits align,
   // be different from the type defined by the language. For example,
   // in C++ the auto variables are in the default address space. Therefore
   // cast alloca to the default address space when necessary.
-  assert(!cir::MissingFeatures::addressSpace());
+
+  LangAS allocaAS = alloca.getAddressSpace()
+                        ? clang::getLangASFromTargetAS(
+                              alloca.getAddressSpace().getValue().getUInt())
+                        : clang::LangAS::Default;
+  LangAS dstTyAS = clang::LangAS::Default;
+  if (getCIRAllocaAddressSpace()) {
+    dstTyAS = clang::getLangASFromTargetAS(
+        getCIRAllocaAddressSpace().getValue().getUInt());
+  }
+
+  if (dstTyAS != allocaAS) {
+    getTargetHooks().performAddrSpaceCast(*this, v, getCIRAllocaAddressSpace(),
+                                          builder.getPointerTo(ty, dstTyAS));
+  }
   return Address(v, ty, align);
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 4461875fcf678..1c4f51c11dc5e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -15,6 +15,7 @@
 
 #include "clang/AST/Expr.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
 #include "clang/CIR/MissingFeatures.h"
 
 #include "mlir/IR/Location.h"
@@ -91,6 +92,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   //===--------------------------------------------------------------------===//
   //                               Utilities
   //===--------------------------------------------------------------------===//
+  mlir::Type convertType(QualType ty) { return cgf.convertType(ty); }
 
   mlir::Value emitComplexToScalarConversion(mlir::Location loc,
                                             mlir::Value value, CastKind kind,
@@ -1888,6 +1890,35 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
     return cgf.getBuilder().createBitcast(cgf.getLoc(subExpr->getSourceRange()),
                                           src, dstTy);
   }
+  case CK_AddressSpaceConversion: {
+    Expr::EvalResult result;
+    if (subExpr->EvaluateAsRValue(result, cgf.getContext()) &&
+        result.Val.isNullPointer()) {
+      // If e has side effect, it is emitted even if its final result is a
+      // null pointer. In that case, a DCE pass should be able to
+      // eliminate the useless instructions emitted during translating E.
+      if (result.HasSideEffects)
+        Visit(subExpr);
+      return cgf.cgm.emitNullConstant(destTy,
+                                      cgf.getLoc(subExpr->getExprLoc()));
+    }
+
+    clang::QualType srcTy = subExpr->IgnoreImpCasts()->getType();
+    if (srcTy->isPointerType() || srcTy->isReferenceType())
+      srcTy = srcTy->getPointeeType();
+
+    clang::LangAS srcLangAS = srcTy.getAddressSpace();
+    cir::TargetAddressSpaceAttr subExprAS;
+    if (clang::isTargetAddressSpace(srcLangAS))
+      subExprAS = cir::toCIRTargetAddressSpace(cgf.getMLIRContext(), srcLangAS);
+    else
+      cgf.cgm.errorNYI(subExpr->getSourceRange(),
+                       "non-target address space conversion");
+    // Since target may map different address spaces in AST to the same address
+    // space, an address space conversion may end up as a bitcast.
+    return cgf.cgm.getTargetCIRGenInfo().performAddrSpaceCast(
+        cgf, Visit(subExpr), subExprAS, convertType(destTy));
+  }
 
   case CK_AtomicToNonAtomic: {
     cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(),
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b71a28c54dbef..4f5948b6e4467 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -222,6 +222,10 @@ class CIRGenFunction : public CIRGenTypeCache {
   const TargetInfo &getTarget() const { return cgm.getTarget(); }
   mlir::MLIRContext &getMLIRContext() { return cgm.getMLIRContext(); }
 
+  const TargetCIRGenInfo &getTargetHooks() const {
+    return cgm.getTargetCIRGenInfo();
+  }
+
   // ---------------------
   // Opaque value handling
   // ---------------------
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 9f9b2db4771df..c1f2581eb96e3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1424,6 +1424,23 @@ CIRGenModule::getAddrOfConstantStringFromLiteral(const StringLiteral *s,
   return builder.getGlobalViewAttr(ptrTy, gv);
 }
 
+// TODO(cir): this could be a common AST helper for both CIR and LLVM codegen.
+LangAS CIRGenModule::getLangTempAllocaAddressSpace() const {
+  if (getLangOpts().OpenCL)
+    return LangAS::opencl_private;
+
+  // For temporaries inside functions, CUDA treats them as normal variables.
+  // LangAS::cuda_device, on the other hand, is reserved for those variables
+  // explicitly marked with __device__.
+  if (getLangOpts().CUDAIsDevice)
+    return LangAS::Default;
+
+  if (getLangOpts().SYCLIsDevice ||
+      (getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice))
+    errorNYI("SYCL or OpenMP temp address space");
+  return LangAS::Default;
+}
+
 void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e,
                                             CIRGenFunction *cgf) {
   if (cgf && e->getType()->isVariablyModifiedType())
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 186913d1bac9d..dc28d9e8e9d33 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -297,6 +297,12 @@ class CIRGenModule : public CIRGenTypeCache {
   getAddrOfConstantStringFromLiteral(const StringLiteral *s,
                                      llvm::StringRef name = ".str");
 
+  /// Returns the address space for temporary allocations in the language. This
+  /// ensures that the allocated variable's address space matches the
+  /// expectations of the AST, rather than using the target's allocation address
+  /// space, which may lead to type mismatches in other parts of the IR.
+  LangAS getLangTempAllocaAddressSpace() const;
+
   /// Set attributes which are common to any form of a global definition (alias,
   /// Objective-C method, function, global variable).
   ///
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index 03618d4a8a8a6..efc2c6c0ba500 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -404,7 +404,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     const ReferenceType *refTy = cast<ReferenceType>(ty);
     QualType elemTy = refTy->getPointeeType();
     auto pointeeType = convertTypeForMem(elemTy);
-    resultType = builder.getPointerTo(pointeeType);
+    resultType = builder.getPointerTo(pointeeType, elemTy.getAddressSpace());
     assert(resultType && "Cannot get pointer type?");
     break;
   }
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index 62a8c59abe604..377c532e492d9 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -1,5 +1,8 @@
 #include "TargetInfo.h"
 #include "ABIInfo.h"
+#include "CIRGenFunction.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
 
 using namespace clang;
 using namespace clang::CIRGen;
@@ -68,3 +71,14 @@ bool TargetCIRGenInfo::isNoProtoCallVariadic(
   // For everything else, we just prefer false unless we opt out.
   return false;
 }
+
+mlir::Value TargetCIRGenInfo::performAddrSpaceCast(
+    CIRGenFunction &cgf, mlir::Value v, cir::TargetAddressSpaceAttr srcAddr,
+    mlir::Type destTy, bool isNonNull) const {
+  // Since target may map different address spaces in AST to the same address
+  // space, an address space conversion may end up as a bitcast.
+  if (cir::GlobalOp globalOp = v.getDefiningOp<cir::GlobalOp>())
+    cgf.cgm.errorNYI("Global op addrspace cast");
+  // Try to preserve the source's name to make IR more readable.
+  return cgf.getBuilder().createAddrSpaceCast(v, destTy);
+}
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.h b/clang/lib/CIR/CodeGen/TargetInfo.h
index dbb0312c76040..72682641a460b 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.h
+++ b/clang/lib/CIR/CodeGen/TargetInfo.h
@@ -17,6 +17,7 @@
 #include "ABIInfo.h"
 #include "CIRGenTypes.h"
 #include "clang/Basic/AddressSpaces.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 
 #include <memory>
 #include <utility>
@@ -33,6 +34,8 @@ bool isEmptyFieldForLayout(const ASTContext &context, const FieldDecl *fd);
 /// if the [[no_unique_address]] attribute would have made them empty.
 bool isEmptyRecordForLayout(const ASTContext &context, QualType t);
 
+class CIRGenFunction;
+
 class TargetCIRGenInfo {
   std::unique_ptr<ABIInfo> info;
 
@@ -48,6 +51,15 @@ class TargetCIRGenInfo {
   virtual cir::TargetAddressSpaceAttr getCIRAllocaAddressSpace() const {
     return {};
   }
+  /// Perform address space cast of an expression of pointer type.
+  /// \param V is the value to be casted to another address space.
+  /// \param DestTy is the destination pointer type.
+  /// \param srcAS is theaddress space of \p V.
+  /// \param IsNonNull is the flag indicating \p V is known to be non null.
+  virtual mlir::Value performAddrSpaceCast(CIRGenFunction &cgf, mlir::Value v,
+                                           cir::TargetAddressSpaceAttr srcAddr,
+                                           mlir::Type destTy,
+                                           bool isNonNull = false) const;
 
   /// Determine whether a call to an unprototyped functions under
   /// the given calling convention should use the variadic
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 7ba03ce40140c..9ac5efe0e41c7 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -383,6 +383,16 @@ LogicalResult cir::CastOp::verify() {
   mlir::Type resType = getType();
   mlir::Type srcType = getSrc().getType();
 
+  // Verify address space casts for pointer types. given that
+  // casts for within a different address space are illegal.
+  auto srcPtrTy = mlir::dyn_cast<cir::PointerType>(srcType);
+  auto resPtrTy = mlir::dyn_cast<cir::PointerType>(resType);
+  if (srcPtrTy && resPtrTy && (getKind() != cir::CastKind::address_space))
+    if (srcPtrTy.getAddrSpace() != resPtrTy.getAddrSpace()) {
+      return emitOpError() << "result type address space does not match the "
+                              "address space of the operand";
+    }
+
   if (mlir::isa<cir::VectorType>(srcType) &&
       mlir::isa<cir::VectorType>(resType)) {
     // Use the element type of the vector to verify the cast kind. (Except for
diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index f7907c76c8ccb..bb87056048ec5 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -12,11 +12,16 @@
 
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/MLIRContext.h"
+#include "clang/Basic/AddressSpaces.h"
 #include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/IR/CIRTypesDetails.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/TypeSwitch.h"
 
 //===----------------------------------------------------------------------===//
@@ -807,6 +812,27 @@ mlir::LogicalResult cir::VectorType::verify(
 // TargetAddressSpace definitions
 //===----------------------------------------------------------------------===//
 
+cir::TargetAddressSpaceAttr
+cir::toCIRTargetAddressSpace(mlir::MLIRContext &context, clang::LangAS langAS) {
+  return cir::TargetAddressSpaceAttr::get(
+      &context,
+      IntegerAttr::get(&context,
+                       llvm::APSInt(clang::toTargetAddressSpace(langAS))));
+}
+
+bool cir::isMatchingAddressSpace(cir::TargetAddressSpaceAttr cirAS,
+                                 clang::LangAS as) {
+  // If there is no CIR target attr, consider it "default" and only match
+  // when the AST address space is LangAS::Default.
+  if (!cirAS)
+    return as == clang::LangAS::Default;
+
+  if (!isTargetAddressSpace(as))
+    return false;
+
+  return cirAS.getValue().getUInt() == toTargetAddressSpace(as);
+}
+
 mlir::ParseResult parseTargetAddressSpace(mlir::AsmParser &p,
                                           cir::TargetAddressSpaceAttr &attr) {
   if (failed(p.parseKeyword("target_address_space")))
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 0782dc1b585c3..88b7adb918b87 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -5409,6 +5409,86 @@ std::string CompilerInvocation::getModuleHash() const {
   return toString(llvm::APInt(64, Hash), 36, /*Signed=*/false);
 }
 
+void CompilerInvocationBase::visitPathsImpl(
+    llvm::function_ref<bool(std::string &)> Predicate) {
+#define RETURN_IF(PATH)                                                        \
+  do {                                                                         \
+    if (Predicate(PATH))                                                       \
+      return;                                                                  \
+  } while (0)
+
+#define RETURN_IF_MANY(PATHS)                                                  \
+  do {                                                                         \
+    if (llvm::any_of(PATHS, Predicate))                                        \
+      return;                                                                  \
+  } while (0)
+
+  auto &HeaderSearchOpts = *this->HSOpts;
+  // Header search paths.
+  RETURN_IF(HeaderSearchOpts.Sysroot);
+  for (auto &Entry : HeaderSearchOpts.UserEntries)
+    if (Entry.IgnoreSysRoot)
+      RETURN_IF(Entry.Path);
+  RETURN_IF(HeaderSearchOpts.ResourceDir);
+  RETURN_IF(HeaderSearchOpts.ModuleCachePath);
+  RETURN_IF(HeaderSearchOpts.ModuleUserBuildPath);
+  for (auto &[Name, File] : HeaderSearchOpts.PrebuiltModuleFiles)
+    RETURN_IF(File);
+  RETURN_IF_MANY(HeaderSearchOpts.PrebuiltModulePaths);
+  RETURN_IF_MANY(HeaderSearchOpts.VFSOverlayFiles);
+
+  // Preprocessor options.
+  auto &PPOpts = *this->PPOpts;
+  RETURN_IF_MANY(PPOpts.MacroIncludes);
+  RETURN_IF_MANY(PPOpts.Includes);
+  RETURN_IF(PPOpts.ImplicitPCHInclude);
+
+  // Frontend options.
+  auto &FrontendOpts = *this->FrontendOpts;
+  for (auto &Input : FrontendOpts.Inputs) {
+    if (Input.isBuffer())
+      continue;
+
+    RETURN_IF(Input.File);
+  }
+  RETURN_IF(FrontendOpts.CodeCompletionAt.FileName);
+  RETURN_IF_MANY(FrontendOpts.ModuleMapFiles);
+  RETURN_IF_MANY(FrontendOpts.ModuleFiles);
+  RETURN_IF_MANY(FrontendOpts.ModulesEmbedFiles);
+  RETURN_IF_MANY(FrontendOpts.ASTMergeFiles);
+  RETURN_IF(FrontendOpts.OverrideRecordLayoutsFile);
+  RETURN_IF(FrontendOpts.StatsFile);
+
+  // Filesystem options.
+  auto &FileSystemOpts = *this->FSOpts;
+  RETURN_IF(FileSystemOpts.WorkingDir);
+
+  // Codegen options.
+  auto &CodeGenOpts = *this->CodeGenOpts;
+  RETURN_IF(CodeGenOpts.DebugCompilationDir);
+  RETURN_IF(CodeGenOpts.CoverageCompilationDir);
+
+  // Sanitizer options.
+  RETURN_IF_MANY(LangOpts->NoSanitizeFiles);
+
+  // Coverage mappings.
+  RETURN_IF(CodeGenOpts.ProfileInstrumentUsePath);
+  RETURN_IF(CodeGenOpts.SampleProfileFile);
+  RETURN_IF(CodeGenOpts.ProfileRemappingFile);
+
+  // Dependency output options.
+  for (auto &ExtraDep : DependencyOutputOpts->ExtraDeps)
+    RETURN_IF(ExtraDep.first);
+}
+
+void CompilerInvocationBase::visitPaths(
+    llvm::function_ref<bool(StringRef)> Callback) const {
+  // The const_cast here is OK, because visitPathsImpl() itself doesn't modify
+  // the invocation, and our callback takes immutable StringRefs.
+  return const_cast<CompilerInvocationBase *>(this)->visitPathsImpl(
+      [&Callback](std::string &Path) { return Callback(StringRef(Path)); });
+}
+
 void CompilerInvocationBase::generateCC1CommandLine(
     ArgumentConsumer Consumer) const {
   llvm::Triple T(getTargetOpts().Triple);
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 823a94ad821e0..b0c7bae46f09e 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2228,9 +2228,10 @@ MacroInfo *ASTReader::ReadMacroRecord(ModuleFile &F, uint64_t Offset) {
         // We have a macro definition. Register the association
         PreprocessedEntityID
             GlobalID = getGlobalPreprocessedEntityID(F, Record[NextIndex]);
+        unsigned Index = translatePreprocessedEntityIDToIndex(GlobalID);
         PreprocessingRecord &PPRec = *PP.getPreprocessingRecord();
         PreprocessingRecord::PPEntityID PPID =
-            PPRec.getPPEntityID(GlobalID - 1, /*isLoaded=*/true);
+            PPRec.getPPEntityID(Index, /*isLoaded=*/true);
         MacroDefinitionRecord *PPDef = cast_or_null<MacroDefinitionRecord>(
             PPRec.getPreprocessedEntity(PPID));
         if (PPDef)
@@ -2261,16 +2262,22 @@ MacroInfo *ASTReader::ReadMacroRecord(ModuleFile &F, uint64_t Offset) {
 
 PreprocessedEntityID
 ASTReader::getGlobalPreprocessedEntityID(ModuleFile &M,
-                                         unsigned LocalID) const {
+                                         PreprocessedEntityID LocalID) const {
   if (!M.ModuleOffsetMap.empty())
     ReadModuleOffsetMap(M);
 
-  ContinuousRangeMap<uint32_t, int, 2>::const_iterator
-    I = M.PreprocessedEntityRemap.find(LocalID - NUM_PREDEF_PP_ENTITY_IDS);
-  assert(I != M.PreprocessedEntityRemap.end()
-         && "Invalid index into preprocessed entity index remap");
+  unsigned ModuleFileIndex = LocalID >> 32;
+  LocalID &= llvm::maskTrailingOnes<PreprocessedEntityID>(32);
+  ModuleFile *MF =
+      ModuleFileIndex ? M.TransitiveImports[ModuleFileIndex - 1] : &M;
+  assert(MF && "malformed identifier ID encoding?");
 
-  return LocalID + I->second;
+  if (!ModuleFileIndex) {
+    assert(LocalID >= NUM_PREDEF_PP_ENTITY_IDS);
+    LocalID -= NUM_PREDEF_PP_ENTITY_IDS;
+  }
+
+  return (static_cast<PreprocessedEntityID>(MF->Index + 1) << 32) | LocalID;
 }
 
 OptionalFileEntryRef
@@ -2547,6 +2554,13 @@ void ASTReader::markIdentifierUpToDate(const IdentifierInfo *II) {
     IdentifierGeneration[II] = getGeneration();
 }
 
+MacroID ASTReader::ReadMacroID(ModuleFile &F, const RecordDataImpl &Record,
+                               unsigned &Idx) {
+  uint64_t ModuleFileIndex = Record[Idx++] << 32;
+  uint64_t LocalIndex = Record[Idx++];
+  return getGlobalMacroID(F, (ModuleFileIndex | LocalIndex));
+}
+
 void ASTReader::resolvePendingMacro(IdentifierInfo *II,
                                     const PendingMacroInfo &PMInfo) {
   ModuleFile &M = *PMInfo.M;
@@ -2597,9 +2611,10 @@ void ASTReader::resolvePendingMacro(IdentifierInfo *II,
     case PP_MODULE_MACRO: {
       ModuleMacros.push_back(ModuleMacroRecord());
       auto &Info = ModuleMacros.back();
-      Info.SubModID = getGlobalSubmoduleID(M, Record[0]);
-      Info.MI = getMacro(getGlobalMacroID(M, Record[1]));
-      for (int I = 2, N = Record.size(); I != N; ++I)
+      unsigned Idx = 0;
+      Info.SubModID = getGlobalSubmoduleID(M, Record[Idx++]);
+      Info.MI = getMacro(ReadMacroID(M, Record, Idx));
+      for (int I = Idx, N = Record.size(); I != N; ++I)
         Info.Overrides.push_back(getGlobalSubmoduleID(M, Record[I]));
       continue;
     }
@@ -4111,8 +4126,6 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
       assert(Blob.size() % sizeof(PPEntityOffset) == 0);
       F.NumPreprocessedEntities = Blob.size() / sizeof(PPEntityOffset);
 
-      unsigned LocalBasePreprocessedEntityID = Record[0];
-
       unsigned StartingID;
       if (!PP.getPreprocessingRecord())
         PP.createPreprocessingRecord();
@@ -4127,12 +4140,6 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
         // Introduce the global -> local mapping for preprocessed entities in
         // this module.
         GlobalPreprocessedEntityMap.insert(std::make_pair(StartingID, &F));
-
-        // Introduce the local -> global mapping for preprocessed entities in
-        // this module.
-        F.PreprocessedEntityRemap.insertOrReplace(
-          std::make_pair(LocalBasePreprocessedEntityID,
-            F.BasePreprocessedEntityID - LocalBasePreprocessedEntityID));
       }
 
       break;
@@ -4343,21 +4350,11 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F,
             "duplicate MACRO_OFFSET record in AST file");
       F.MacroOffsets = (const uint32_t *)Blob.data();
       F.LocalNumMacros = Record[0];
-      unsigned LocalBaseMacroID = Record[1];
-      F.MacroOffsetsBase = Record[2] + F.ASTBlockStartOffset;
+      F.MacroOffsetsBase = Record[1] + F.ASTBlockStartOffset;
       F.BaseMacroID = getTotalNumMacros();
 
-      if (F.LocalNumMacros > 0) {
-        // Introduce the global -> local mapping for macros within this module.
-        GlobalMacroMap.insert(std::make_pair(getTotalNumMacros() + 1, &F));
-
-        // Introduce the local -> global mapping for macros within this module.
-        F.MacroRemap.insertOrReplace(
-          std::make_pair(LocalBaseMacroID,
-                         F.BaseMacroID - LocalBaseMacroID));
-
+      if (F.LocalNumMacros > 0)
         MacrosLoaded.resize(MacrosLoaded.size() + F.LocalNumMacros);
-      }
       break;
     }
 
@@ -4463,8 +4460,6 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
   F.ModuleOffsetMap = StringRef();
 
   using RemapBuilder = ContinuousRangeMap<uint32_t, int, 2>::Builder;
-  RemapBuilder MacroRemap(F.MacroRemap);
-  RemapBuilder PreprocessedEntityRemap(F.PreprocessedEntityRemap);
   RemapBuilder SubmoduleRemap(F.SubmoduleRemap);
   RemapBuilder SelectorRemap(F.SelectorRemap);
 
@@ -4494,10 +4489,6 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
 
     ImportedModuleVector.push_back(OM);
 
-    uint32_t MacroIDOffset =
-        endian::readNext<uint32_t, llvm::endianness::little>(Data);
-    uint32_t PreprocessedEntityIDOffset =
-        endian::readNext<uint32_t, llvm::endianness::little>(Data);
     uint32_t SubmoduleIDOffset =
         endian::readNext<uint32_t, llvm::endianness::little>(Data);
     uint32_t SelectorIDOffset =
@@ -4511,9 +4502,6 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
                                     static_cast<int>(BaseOffset - Offset)));
     };
 
-    mapOffset(MacroIDOffset, OM->BaseMacroID, MacroRemap);
-    mapOffset(PreprocessedEntityIDOffset, OM->BasePreprocessedEntityID,
-              PreprocessedEntityRemap);
     mapOffset(SubmoduleIDOffset, OM->BaseSubmoduleID, SubmoduleRemap);
     mapOffset(SelectorIDOffset, OM->BaseSelectorID, SelectorRemap);
   }
@@ -6725,11 +6713,23 @@ SourceRange ASTReader::ReadSkippedRange(unsigned GlobalIndex) {
   return Range;
 }
 
+unsigned
+ASTReader::translatePreprocessedEntityIDToIndex(PreprocessedEntityID ID) const {
+  unsigned ModuleFileIndex = ID >> 32;
+  assert(ModuleFileIndex && "not translating loaded MacroID?");
+  assert(getModuleManager().size() > ModuleFileIndex - 1);
+  ModuleFile &MF = getModuleManager()[ModuleFileIndex - 1];
+
+  ID &= llvm::maskTrailingOnes<PreprocessedEntityID>(32);
+  return MF.BasePreprocessedEntityID + ID;
+}
+
 PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
-  PreprocessedEntityID PPID = Index+1;
   std::pair<ModuleFile *, unsigned> PPInfo = getModulePreprocessedEntity(Index);
   ModuleFile &M = *PPInfo.first;
   unsigned LocalIndex = PPInfo.second;
+  PreprocessedEntityID PPID =
+      (static_cast<PreprocessedEntityID>(M.Index + 1) << 32) | LocalIndex;
   const PPEntityOffset &PPOffs = M.PreprocessedEntityOffsets[LocalIndex];
 
   if (!PP.getPreprocessingRecord()) {
@@ -6777,8 +6777,9 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
     else {
       PreprocessedEntityID GlobalID =
           getGlobalPreprocessedEntityID(M, Record[1]);
-      Def = cast<MacroDefinitionRecord>(
-          PPRec.getLoadedPreprocessedEntity(GlobalID - 1));
+      unsigned Index = translatePreprocessedEntityIDToIndex(GlobalID);
+      Def =
+          cast<MacroDefinitionRecord>(PPRec.getLoadedPreprocessedEntity(Index));
     }
 
     MacroExpansion *ME;
@@ -6831,8 +6832,8 @@ PreprocessedEntity *ASTReader::ReadPreprocessedEntity(unsigned Index) {
 /// \param SLocMapI points at a chunk of a module that contains no
 /// preprocessed entities or the entities it contains are not the ones we are
 /// looking for.
-PreprocessedEntityID ASTReader::findNextPreprocessedEntity(
-                       GlobalSLocOffsetMapType::const_iterator SLocMapI) const {
+unsigned ASTReader::findNextPreprocessedEntity(
+    GlobalSLocOffsetMapType::const_iterator SLocMapI) const {
   ++SLocMapI;
   for (GlobalSLocOffsetMapType::const_iterator
          EndI = GlobalSLocOffsetMap.end(); SLocMapI != EndI; ++SLocMapI) {
@@ -6875,8 +6876,8 @@ struct PPEntityComp {
 
 } // namespace
 
-PreprocessedEntityID ASTReader::findPreprocessedEntity(SourceLocation Loc,
-                                                       bool EndsAfter) const {
+unsigned ASTReader::findPreprocessedEntity(SourceLocation Loc,
+                                           bool EndsAfter) const {
   if (SourceMgr.isLocalSourceLocation(Loc))
     return getTotalNumPreprocessedEntities();
 
@@ -6936,9 +6937,8 @@ std::pair<unsigned, unsigned>
     return std::make_pair(0,0);
   assert(!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(),Range.getBegin()));
 
-  PreprocessedEntityID BeginID =
-      findPreprocessedEntity(Range.getBegin(), false);
-  PreprocessedEntityID EndID = findPreprocessedEntity(Range.getEnd(), true);
+  unsigned BeginID = findPreprocessedEntity(Range.getBegin(), false);
+  unsigned EndID = findPreprocessedEntity(Range.getEnd(), true);
   return std::make_pair(BeginID, EndID);
 }
 
@@ -8963,7 +8963,6 @@ LLVM_DUMP_METHOD void ASTReader::dump() {
   llvm::errs() << "*** PCH/ModuleFile Remappings:\n";
   dumpModuleIDMap("Global bit offset map", GlobalBitOffsetsMap);
   dumpModuleIDMap("Global source location entry map", GlobalSLocEntryMap);
-  dumpModuleIDMap("Global macro map", GlobalMacroMap);
   dumpModuleIDMap("Global submodule map", GlobalSubmoduleMap);
   dumpModuleIDMap("Global selector map", GlobalSelectorMap);
   dumpModuleIDMap("Global preprocessed entity map",
@@ -9746,6 +9745,21 @@ IdentifierID ASTReader::getGlobalIdentifierID(ModuleFile &M, uint64_t LocalID) {
   return ((IdentifierID)(MF->Index + 1) << 32) | LocalID;
 }
 
+std::pair<ModuleFile *, unsigned>
+ASTReader::translateMacroIDToIndex(MacroID ID) const {
+  if (ID == 0)
+    return {nullptr, 0};
+
+  unsigned ModuleFileIndex = ID >> 32;
+  assert(ModuleFileIndex && "not translating loaded MacroID?");
+  assert(getModuleManager().size() > ModuleFileIndex - 1);
+  ModuleFile &MF = getModuleManager()[ModuleFileIndex - 1];
+
+  unsigned LocalID = ID & llvm::maskTrailingOnes<MacroID>(32);
+  assert(LocalID < MF.LocalNumMacros);
+  return {&MF, MF.BaseMacroID + LocalID};
+}
+
 MacroInfo *ASTReader::getMacro(MacroID ID) {
   if (ID == 0)
     return nullptr;
@@ -9755,36 +9769,40 @@ MacroInfo *ASTReader::getMacro(MacroID ID) {
     return nullptr;
   }
 
-  ID -= NUM_PREDEF_MACRO_IDS;
-  if (!MacrosLoaded[ID]) {
-    GlobalMacroMapType::iterator I
-      = GlobalMacroMap.find(ID + NUM_PREDEF_MACRO_IDS);
-    assert(I != GlobalMacroMap.end() && "Corrupted global macro map");
-    ModuleFile *M = I->second;
-    unsigned Index = ID - M->BaseMacroID;
-    MacrosLoaded[ID] =
-        ReadMacroRecord(*M, M->MacroOffsetsBase + M->MacroOffsets[Index]);
+  auto [M, Index] = translateMacroIDToIndex(ID);
+  if (!MacrosLoaded[Index]) {
+    assert(M != nullptr && "Untranslated Macro ID?");
+    assert(Index >= M->BaseMacroID);
+    unsigned LocalIndex = Index - M->BaseMacroID;
+    uint64_t DataOffset = M->MacroOffsetsBase + M->MacroOffsets[LocalIndex];
+    MacrosLoaded[Index] = ReadMacroRecord(*M, DataOffset);
 
     if (DeserializationListener)
-      DeserializationListener->MacroRead(ID + NUM_PREDEF_MACRO_IDS,
-                                         MacrosLoaded[ID]);
+      DeserializationListener->MacroRead(ID, MacrosLoaded[Index]);
   }
 
-  return MacrosLoaded[ID];
+  return MacrosLoaded[Index];
 }
 
-MacroID ASTReader::getGlobalMacroID(ModuleFile &M, unsigned LocalID) {
+MacroID ASTReader::getGlobalMacroID(ModuleFile &M, MacroID LocalID) {
   if (LocalID < NUM_PREDEF_MACRO_IDS)
     return LocalID;
 
   if (!M.ModuleOffsetMap.empty())
     ReadModuleOffsetMap(M);
 
-  ContinuousRangeMap<uint32_t, int, 2>::iterator I
-    = M.MacroRemap.find(LocalID - NUM_PREDEF_MACRO_IDS);
-  assert(I != M.MacroRemap.end() && "Invalid index into macro index remap");
+  unsigned ModuleFileIndex = LocalID >> 32;
+  LocalID &= llvm::maskTrailingOnes<MacroID>(32);
+  ModuleFile *MF =
+      ModuleFileIndex ? M.TransitiveImports[ModuleFileIndex - 1] : &M;
+  assert(MF && "malformed identifier ID encoding?");
 
-  return LocalID + I->second;
+  if (!ModuleFileIndex) {
+    assert(LocalID >= NUM_PREDEF_MACRO_IDS);
+    LocalID -= NUM_PREDEF_MACRO_IDS;
+  }
+
+  return (static_cast<MacroID>(MF->Index + 1) << 32) | LocalID;
 }
 
 serialization::SubmoduleID
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index e4618d60a8acb..e8c0d3f2b4ee9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -2691,7 +2691,7 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
         Record.push_back(VisMD->isPublic());
       }
       ModuleMacroRecord.push_back(getSubmoduleID(WritingModule));
-      ModuleMacroRecord.push_back(getMacroRef(MD->getMacroInfo(), Name));
+      AddMacroRef(MD->getMacroInfo(), Name, ModuleMacroRecord);
       Stream.EmitRecord(PP_MODULE_MACRO, ModuleMacroRecord);
       ModuleMacroRecord.clear();
       EmittedModuleMacros = true;
@@ -2720,7 +2720,7 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
 
         // Emit a record indicating this submodule exports this macro.
         ModuleMacroRecord.push_back(getSubmoduleID(Macro->getOwningModule()));
-        ModuleMacroRecord.push_back(getMacroRef(Macro->getMacroInfo(), Name));
+        AddMacroRef(Macro->getMacroInfo(), Name, ModuleMacroRecord);
         for (auto *M : Macro->overrides())
           ModuleMacroRecord.push_back(getSubmoduleID(M->getOwningModule()));
 
@@ -2819,14 +2819,12 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
   auto Abbrev = std::make_shared<BitCodeAbbrev>();
   Abbrev->Add(BitCodeAbbrevOp(MACRO_OFFSET));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // # of macros
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first ID
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 32));   // base offset
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
 
   unsigned MacroOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   {
     RecordData::value_type Record[] = {MACRO_OFFSET, MacroOffsets.size(),
-                                       FirstMacroID - NUM_PREDEF_MACRO_IDS,
                                        MacroOffsetsBase - ASTBlockStartOffset};
     Stream.EmitRecordWithBlob(MacroOffsetAbbrev, Record, bytes(MacroOffsets));
   }
@@ -2859,9 +2857,7 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
     InclusionAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
   }
 
-  unsigned FirstPreprocessorEntityID
-    = (Chain ? PPRec.getNumLoadedPreprocessedEntities() : 0)
-    + NUM_PREDEF_PP_ENTITY_IDS;
+  unsigned FirstPreprocessorEntityID = NUM_PREDEF_PP_ENTITY_IDS;
   unsigned NextPreprocessorEntityID = FirstPreprocessorEntityID;
   RecordData Record;
   for (PreprocessingRecord::iterator E = PPRec.local_begin(),
@@ -2925,13 +2921,10 @@ void ASTWriter::WritePreprocessorDetail(PreprocessingRecord &PPRec,
 
     auto Abbrev = std::make_shared<BitCodeAbbrev>();
     Abbrev->Add(BitCodeAbbrevOp(PPD_ENTITIES_OFFSETS));
-    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // first pp entity
     Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
     unsigned PPEOffsetAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
 
-    RecordData::value_type Record[] = {PPD_ENTITIES_OFFSETS,
-                                       FirstPreprocessorEntityID -
-                                           NUM_PREDEF_PP_ENTITY_IDS};
+    RecordData::value_type Record[] = {PPD_ENTITIES_OFFSETS};
     Stream.EmitRecordWithBlob(PPEOffsetAbbrev, Record,
                               bytes(PreprocessedEntityOffsets));
   }
@@ -6100,9 +6093,6 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot,
 
         // These values should be unique within a chain, since they will be read
         // as keys into ContinuousRangeMaps.
-        writeBaseIDOrNone(M.BaseMacroID, M.LocalNumMacros);
-        writeBaseIDOrNone(M.BasePreprocessedEntityID,
-                          M.NumPreprocessedEntities);
         writeBaseIDOrNone(M.BaseSubmoduleID, M.LocalNumSubmodules);
         writeBaseIDOrNone(M.BaseSelectorID, M.LocalNumSelectors);
       }
@@ -6903,6 +6893,13 @@ void ASTWriter::AddLookupOffsets(const LookupBlockOffsets &Offsets,
   Record.push_back(Offsets.TULocalOffset);
 }
 
+void ASTWriter::AddMacroRef(MacroInfo *MI, const IdentifierInfo *Name,
+                            RecordDataImpl &Record) {
+  MacroID MacroRef = getMacroRef(MI, Name);
+  Record.push_back(MacroRef >> 32);
+  Record.push_back(MacroRef & llvm::maskTrailingOnes<MacroID>(32));
+}
+
 void ASTWriter::AddEmittedDeclRef(const Decl *D, RecordDataImpl &Record) {
   if (!wasDeclEmitted(D))
     return;
@@ -7383,12 +7380,8 @@ void ASTWriter::ReaderInitialized(ASTReader *Reader) {
 
   Chain = Reader;
 
-  // Note, this will get called multiple times, once one the reader starts up
-  // and again each time it's done reading a PCH or module.
-  FirstMacroID = NUM_PREDEF_MACRO_IDS + Chain->getTotalNumMacros();
   FirstSubmoduleID = NUM_PREDEF_SUBMODULE_IDS + Chain->getTotalNumSubmodules();
   FirstSelectorID = NUM_PREDEF_SELECTOR_IDS + Chain->getTotalNumSelectors();
-  NextMacroID = FirstMacroID;
   NextSelectorID = FirstSelectorID;
   NextSubmoduleID = FirstSubmoduleID;
 }
@@ -7416,6 +7409,14 @@ void ASTWriter::IdentifierRead(IdentifierID ID, IdentifierInfo *II) {
 void ASTWriter::MacroRead(serialization::MacroID ID, MacroInfo *MI) {
   // Always keep the highest ID. See \p TypeRead() for more information.
   MacroID &StoredID = MacroIDs[MI];
+  unsigned OriginalModuleFileIndex = StoredID >> 32;
+
+  // Always keep the local macro ID. See \p TypeRead() for more information.
+  if (OriginalModuleFileIndex == 0 && StoredID)
+    return;
+
+  // Otherwise, keep the highest ID since the module file comes later has
+  // higher module file indexes.
   if (ID > StoredID)
     StoredID = ID;
 }
diff --git a/clang/lib/Serialization/ModuleFile.cpp b/clang/lib/Serialization/ModuleFile.cpp
index 4858cdbda5545..7f631eafcaf35 100644
--- a/clang/lib/Serialization/ModuleFile.cpp
+++ b/clang/lib/Serialization/ModuleFile.cpp
@@ -65,7 +65,6 @@ LLVM_DUMP_METHOD void ModuleFile::dump() {
 
   llvm::errs() << "  Base macro ID: " << BaseMacroID << '\n'
                << "  Number of macros: " << LocalNumMacros << '\n';
-  dumpLocalRemap("Macro ID local -> global map", MacroRemap);
 
   llvm::errs() << "  Base submodule ID: " << BaseSubmoduleID << '\n'
                << "  Number of submodules: " << LocalNumSubmodules << '\n';
@@ -79,8 +78,6 @@ LLVM_DUMP_METHOD void ModuleFile::dump() {
                << '\n'
                << "  Number of preprocessed entities: "
                << NumPreprocessedEntities << '\n';
-  dumpLocalRemap("Preprocessed entity ID local -> global map",
-                 PreprocessedEntityRemap);
 
   llvm::errs() << "  Base type index: " << BaseTypeIndex << '\n'
                << "  Number of types: " << LocalNumTypes << '\n';
diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
index e07a208748b77..0022597348a82 100644
--- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
+++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
@@ -471,82 +471,13 @@ static bool isSafeToIgnoreCWD(const CowCompilerInvocation &CI) {
   // Check if the command line input uses relative paths.
   // It is not safe to ignore the current working directory if any of the
   // command line inputs use relative paths.
-#define IF_RELATIVE_RETURN_FALSE(PATH)                                         \
-  do {                                                                         \
-    if (!PATH.empty() && !llvm::sys::path::is_absolute(PATH))                  \
-      return false;                                                            \
-  } while (0)
-
-#define IF_ANY_RELATIVE_RETURN_FALSE(PATHS)                                    \
-  do {                                                                         \
-    if (llvm::any_of(PATHS, [](const auto &P) {                                \
-          return !P.empty() && !llvm::sys::path::is_absolute(P);               \
-        }))                                                                    \
-      return false;                                                            \
-  } while (0)
-
-  // Header search paths.
-  const auto &HeaderSearchOpts = CI.getHeaderSearchOpts();
-  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.Sysroot);
-  for (auto &Entry : HeaderSearchOpts.UserEntries)
-    if (Entry.IgnoreSysRoot)
-      IF_RELATIVE_RETURN_FALSE(Entry.Path);
-  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ResourceDir);
-  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleCachePath);
-  IF_RELATIVE_RETURN_FALSE(HeaderSearchOpts.ModuleUserBuildPath);
-  for (auto I = HeaderSearchOpts.PrebuiltModuleFiles.begin(),
-            E = HeaderSearchOpts.PrebuiltModuleFiles.end();
-       I != E;) {
-    auto Current = I++;
-    IF_RELATIVE_RETURN_FALSE(Current->second);
-  }
-  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.PrebuiltModulePaths);
-  IF_ANY_RELATIVE_RETURN_FALSE(HeaderSearchOpts.VFSOverlayFiles);
-
-  // Preprocessor options.
-  const auto &PPOpts = CI.getPreprocessorOpts();
-  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.MacroIncludes);
-  IF_ANY_RELATIVE_RETURN_FALSE(PPOpts.Includes);
-  IF_RELATIVE_RETURN_FALSE(PPOpts.ImplicitPCHInclude);
-
-  // Frontend options.
-  const auto &FrontendOpts = CI.getFrontendOpts();
-  for (const FrontendInputFile &Input : FrontendOpts.Inputs) {
-    if (Input.isBuffer())
-      continue; // FIXME: Can this happen when parsing command-line?
-
-    IF_RELATIVE_RETURN_FALSE(Input.getFile());
-  }
-  IF_RELATIVE_RETURN_FALSE(FrontendOpts.CodeCompletionAt.FileName);
-  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleMapFiles);
-  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModuleFiles);
-  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ModulesEmbedFiles);
-  IF_ANY_RELATIVE_RETURN_FALSE(FrontendOpts.ASTMergeFiles);
-  IF_RELATIVE_RETURN_FALSE(FrontendOpts.OverrideRecordLayoutsFile);
-  IF_RELATIVE_RETURN_FALSE(FrontendOpts.StatsFile);
-
-  // Filesystem options.
-  const auto &FileSystemOpts = CI.getFileSystemOpts();
-  IF_RELATIVE_RETURN_FALSE(FileSystemOpts.WorkingDir);
-
-  // Codegen options.
-  const auto &CodeGenOpts = CI.getCodeGenOpts();
-  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.DebugCompilationDir);
-  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.CoverageCompilationDir);
-
-  // Sanitizer options.
-  IF_ANY_RELATIVE_RETURN_FALSE(CI.getLangOpts().NoSanitizeFiles);
-
-  // Coverage mappings.
-  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileInstrumentUsePath);
-  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.SampleProfileFile);
-  IF_RELATIVE_RETURN_FALSE(CodeGenOpts.ProfileRemappingFile);
-
-  // Dependency output options.
-  for (auto &ExtraDep : CI.getDependencyOutputOpts().ExtraDeps)
-    IF_RELATIVE_RETURN_FALSE(ExtraDep.first);
-
-  return true;
+  bool AnyRelative = false;
+  CI.visitPaths([&](StringRef Path) {
+    assert(!AnyRelative && "Continuing path visitation despite returning true");
+    AnyRelative |= !Path.empty() && !llvm::sys::path::is_absolute(Path);
+    return AnyRelative;
+  });
+  return !AnyRelative;
 }
 
 static std::string getModuleContextHash(const ModuleDeps &MD,
diff --git a/clang/test/CIR/CodeGen/address-space-conversion.cpp b/clang/test/CIR/CodeGen/address-space-conversion.cpp
new file mode 100644
index 0000000000000..ca026be60ee71
--- /dev/null
+++ b/clang/test/CIR/CodeGen/address-space-conversion.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+using pi1_t = int __attribute__((address_space(1))) *;
+using pi2_t = int __attribute__((address_space(2))) *;
+
+using ri1_t = int __attribute__((address_space(1))) &;
+using ri2_t = int __attribute__((address_space(2))) &;
+
+// CIR: cir.func dso_local @{{.*test_ptr.*}}
+// LLVM: define dso_local void @{{.*test_ptr.*}}
+// OGCG: define dso_local void @{{.*test_ptr.*}}
+void test_ptr() {
+  pi1_t ptr1;
+  pi2_t ptr2 = (pi2_t)ptr1;
+  // CIR:      %[[#PTR1:]] = cir.load{{.*}} %{{[0-9]+}} : !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>, !cir.ptr<!s32i, target_address_space(1)>
+  // CIR-NEXT: %[[#CAST:]] = cir.cast address_space %[[#PTR1]] : !cir.ptr<!s32i, target_address_space(1)> -> !cir.ptr<!s32i, target_address_space(2)>
+  // CIR-NEXT: cir.store{{.*}} %[[#CAST]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(2)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(2)>>
+
+  // LLVM:      %[[#PTR1:]] = load ptr addrspace(1), ptr %{{.*}}
+  // LLVM-NEXT: %[[#CAST:]] = addrspacecast ptr addrspace(1) %[[#PTR1]] to ptr addrspace(2)
+  // LLVM-NEXT: store ptr addrspace(2) %[[#CAST]], ptr %{{.*}}
+
+  // OGCG:      %{{.*}} = load ptr addrspace(1), ptr %{{.*}}
+  // OGCG-NEXT: %{{.*}} = addrspacecast ptr addrspace(1) %{{.*}} to ptr addrspace(2)
+  // OGCG-NEXT: store ptr addrspace(2)  %{{.*}}, ptr %{{.*}}
+}
+
+// CIR: cir.func dso_local @{{.*test_ref.*}}
+// LLVM: define dso_local void @{{.*test_ref.*}}
+// OGCG: define dso_local void @{{.*test_ref.*}}
+void test_ref() {
+  pi1_t ptr;
+  ri1_t ref1 = *ptr;
+  ri2_t ref2 = (ri2_t)ref1;
+  // CIR:      %[[#DEREF:]] = cir.load deref{{.*}} %{{[0-9]+}} : !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>, !cir.ptr<!s32i, target_address_space(1)>
+  // CIR-NEXT: cir.store{{.*}} %[[#DEREF]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(1)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>
+  // CIR-NEXT: %[[#REF1:]] = cir.load %{{[0-9]+}} : !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>, !cir.ptr<!s32i, target_address_space(1)>
+  // CIR-NEXT: %[[#CAST:]] = cir.cast address_space %[[#REF1]] : !cir.ptr<!s32i, target_address_space(1)> -> !cir.ptr<!s32i, target_address_space(2)>
+  // CIR-NEXT: cir.store{{.*}} %[[#CAST]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(2)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(2)>>
+
+  // LLVM:      %[[#DEREF:]] = load ptr addrspace(1), ptr %{{.*}}
+  // LLVM-NEXT: store ptr addrspace(1) %[[#DEREF]], ptr %{{.*}}
+  // LLVM-NEXT: %[[#REF1:]] = load ptr addrspace(1), ptr %{{.*}}
+  // LLVM-NEXT: %[[#CAST:]] = addrspacecast ptr addrspace(1) %[[#REF1]] to ptr addrspace(2)
+  // LLVM-NEXT: store ptr addrspace(2) %[[#CAST]], ptr %{{.*}}
+
+  // OGCG:      %{{.*}} = load ptr addrspace(1), ptr %{{.*}}
+  // OGCG-NEXT: store ptr addrspace(1) %{{.*}}, ptr %{{.*}}
+  // OGCG-NEXT: %{{.*}} = load ptr addrspace(1), ptr %{{.*}}
+  // OGCG-NEXT: %{{.*}} = addrspacecast ptr addrspace(1) %{{.*}} to ptr addrspace(2)
+  // OGCG-NEXT: store ptr addrspace(2) %{{.*}}, ptr %{{.*}}
+}
+
+// CIR: cir.func dso_local @{{.*test_nullptr.*}}
+// LLVM: define dso_local void @{{.*test_nullptr.*}}
+// OGCG: define dso_local void @{{.*test_nullptr.*}}
+void test_nullptr() {
+  constexpr pi1_t null1 = nullptr;
+  pi2_t ptr = (pi2_t)null1;
+  // CIR:      %[[#NULL1:]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i, target_address_space(1)>
+  // CIR-NEXT: cir.store{{.*}} %[[#NULL1]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(1)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>
+  // CIR-NEXT: %[[#NULL2:]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i, target_address_space(2)>
+  // CIR-NEXT: cir.store{{.*}} %[[#NULL2]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(2)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(2)>>
+
+  // LLVM:      store ptr addrspace(1) null, ptr %{{.*}}
+  // LLVM-NEXT: store ptr addrspace(2) null, ptr %{{.*}}
+
+  // OGCG:      store ptr addrspace(1) null, ptr %{{.*}}
+  // OGCG-NEXT: store ptr addrspace(2) null, ptr %{{.*}}
+}
+
+// CIR: cir.func dso_local @{{.*test_side_effect.*}}
+// LLVM: define dso_local void @{{.*test_side_effect.*}}
+// OGCG: define dso_local void @{{.*test_side_effect.*}}
+void test_side_effect(pi1_t b) {
+  pi2_t p = (pi2_t)(*b++, (int*)0);
+  // CIR:      %[[#DEREF:]] = cir.load deref{{.*}} %{{[0-9]+}} : !cir.ptr<!cir.ptr<!s32i, target_address_space(1)>>, !cir.ptr<!s32i, target_address_space(1)>
+  // CIR:      %[[#STRIDE:]] = cir.ptr_stride %[[#DEREF]], %{{[0-9]+}} : (!cir.ptr<!s32i, target_address_space(1)>, !s32i) -> !cir.ptr<!s32i, target_address_space(1)>
+  // CIR:      %[[#NULL:]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i, target_address_space(2)>
+  // CIR-NEXT: cir.store{{.*}} %[[#NULL]], %{{[0-9]+}} : !cir.ptr<!s32i, target_address_space(2)>, !cir.ptr<!cir.ptr<!s32i, target_address_space(2)>>
+
+  // LLVM:      %{{[0-9]+}} = getelementptr {{.*}}i32, ptr addrspace(1) %{{[0-9]+}}, i{{32|64}} 1
+  // LLVM:      store ptr addrspace(2) null, ptr %{{.*}}
+
+  // OGCG:      %{{.*}} = getelementptr{{.*}} i32, ptr addrspace(1) %{{.*}}, i32 1
+  // OGCG:      store ptr addrspace(2) null, ptr %{{.*}}
+}
diff --git a/clang/test/CIR/IR/invalid-addrspace.cir b/clang/test/CIR/IR/invalid-addrspace.cir
index 8f188b840bdec..4b6a388b1e4a8 100644
--- a/clang/test/CIR/IR/invalid-addrspace.cir
+++ b/clang/test/CIR/IR/invalid-addrspace.cir
@@ -24,4 +24,3 @@ cir.func @address_space2(%p : !cir.ptr<!u64i, target_address_space>) {
 cir.func @address_space3(%p : !cir.ptr<!u64i, target_address_space()>) {
   cir.return
 }
-
diff --git a/clang/test/Driver/hip-temps-linux.hip b/clang/test/Driver/hip-temps-linux.hip
index 83a7528dd4560..e4c6282ba6fbd 100644
--- a/clang/test/Driver/hip-temps-linux.hip
+++ b/clang/test/Driver/hip-temps-linux.hip
@@ -1,18 +1,18 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
-// REQUIRES: system-linux
+// UNSUPPORTED: system-windows
 
 // Check no temporary files or directores are left after compilation.
 // RUN: rm -rf %t/mytmp
 // RUN: mkdir -p %t/mytmp
-// RUN: env TMPDIR="%t/mytmp" %clang --target=x86_64-linux-gnu -nogpulib -nogpuinc \
+// RUN: env TMP="%t/mytmp" TMPDIR="%t/mytmp" %clang --target=x86_64-linux-gnu -nogpulib -nogpuinc \
 // RUN:   --rocm-path=%S/Inputs/rocm -nostdinc -nostdlib -c \
 // RUN:   --offload-arch=gfx1030 -emit-llvm -v %s 2>&1 | \
-// RUN:   FileCheck -check-prefixes=CHECK %s
+// RUN:   FileCheck -check-prefixes=CHECK -DOUTPUT_PATH="%t%{fs-sep}mytmp%{fs-sep}" %s
 // RUN: ls %t/mytmp >%t/mytmp.txt 2>&1
 // RUN: touch %t/empty.txt
 // RUN: diff %t/mytmp.txt %t/empty.txt
 
-// CHECK: -o {{.*}}/mytmp/hip-temps-linux-gfx1030-{{.*}}.bc
+// CHECK: -o {{"?}}[[OUTPUT_PATH]]hip-temps-linux-gfx1030-{{.*}}.bc{{"?}}
 
 int main() {}
diff --git a/clang/test/Modules/no-transitive-macro-change.cpp b/clang/test/Modules/no-transitive-macro-change.cpp
new file mode 100644
index 0000000000000..fced26490c27f
--- /dev/null
+++ b/clang/test/Modules/no-transitive-macro-change.cpp
@@ -0,0 +1,23 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header \
+// RUN:  %t/a.h -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header \
+// RUN:  %t/b.h -o %t/b.pcm -fmodule-file=%t/a.pcm
+// RUN: echo "#define A2 44" >> %t/a.h
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header \
+// RUN:  %t/a.h -o %t/a.v1.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header \
+// RUN:  %t/b.h -o %t/b.v1.pcm -fmodule-file=%t/a.v1.pcm
+// RUN: not diff %t/b.pcm %t/b.v1.pcm &> /dev/null
+
+//--- a.h
+#pragma once
+#define A 43
+
+//--- b.h
+#pragma once
+import "a.h";
+#define B 43
+const int a = A;
diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp
index 3460a8675bf04..b9368db550805 100644
--- a/clang/test/Sema/warn-lifetime-safety.cpp
+++ b/clang/test/Sema/warn-lifetime-safety.cpp
@@ -655,3 +655,34 @@ void conditional_operator_lifetimebound_nested_deep(bool cond) {
   }  // expected-note 4 {{destroyed here}}
   (void)*p;  // expected-note 4 {{later used here}}
 }
+
+void parentheses(bool cond) {
+  MyObj* p;
+  {
+    MyObj a;
+    p = &((((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
+  }                  // expected-note {{destroyed here}}
+  (void)*p;          // expected-note {{later used here}}
+
+  {
+    MyObj a;
+    p = ((GetPointer((a))));  // expected-warning {{object whose reference is captured does not live long enough}}
+  }                           // expected-note {{destroyed here}}
+  (void)*p;                   // expected-note {{later used here}}
+
+  {
+    MyObj a, b, c, d;
+    p = &(cond ? (cond ? a     // expected-warning {{object whose reference is captured does not live long enough}}.
+                       : b)    // expected-warning {{object whose reference is captured does not live long enough}}.
+               : (cond ? c     // expected-warning {{object whose reference is captured does not live long enough}}.
+                       : d));  // expected-warning {{object whose reference is captured does not live long enough}}.
+  }  // expected-note 4 {{destroyed here}}
+  (void)*p;  // expected-note 4 {{later used here}}
+
+  {
+    MyObj a, b, c, d;
+    p = ((cond ? (((cond ? &a : &b)))   // expected-warning 2 {{object whose reference is captured does not live long enough}}.
+              : &(((cond ? c : d)))));  // expected-warning 2 {{object whose reference is captured does not live long enough}}.
+  }  // expected-note 4 {{destroyed here}}
+  (void)*p;  // expected-note 4 {{later used here}}
+}
diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
index 9d61d56e078e3..601308c53f9a9 100644
--- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp
+++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp
@@ -700,6 +700,23 @@ TEST_F(LifetimeAnalysisTest, GslPointerInConditionalOperator) {
   EXPECT_THAT(Origin("v"), HasLoansTo({"a", "b"}, "p1"));
 }
 
+TEST_F(LifetimeAnalysisTest, ExtraParenthesis) {
+  SetupTest(R"(
+    void target() {
+      MyObj a;
+      View x = ((View((((a))))));
+      View y = ((View{(((x)))}));
+      View z = ((View(((y)))));
+      View p = ((View{((x))}));
+      POINT(p1);
+    }
+  )");
+  EXPECT_THAT(Origin("x"), HasLoansTo({"a"}, "p1"));
+  EXPECT_THAT(Origin("y"), HasLoansTo({"a"}, "p1"));
+  EXPECT_THAT(Origin("z"), HasLoansTo({"a"}, "p1"));
+  EXPECT_THAT(Origin("p"), HasLoansTo({"a"}, "p1"));
+}
+
 // FIXME: Handle temporaries.
 TEST_F(LifetimeAnalysisTest, ViewFromTemporary) {
   SetupTest(R"(
diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 246a22c56c31a..856872425117a 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -646,6 +646,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xC033:  // 33 C0 : xor eax, eax
     case 0xC933:  // 33 C9 : xor ecx, ecx
     case 0xD233:  // 33 D2 : xor edx, edx
+    case 0xFF33:  // 33 FF : xor edi, edi
     case 0x9066:  // 66 90 : xchg %ax,%ax (Two-byte NOP)
     case 0xDB84:  // 84 DB : test bl,bl
     case 0xC084:  // 84 C0 : test al,al
@@ -764,6 +765,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
 
   switch (0x00FFFFFF & *(u32 *)address) {
     case 0x10b70f:    // 0f b7 10 : movzx edx, WORD PTR [rax]
+    case 0x02b70f:    // 0f b7 02 : movzx eax, WORD PTR [rdx]
     case 0xc00b4d:    // 4d 0b c0 : or r8, r8
     case 0xc03345:    // 45 33 c0 : xor r8d, r8d
     case 0xc08548:    // 48 85 c0 : test rax, rax
@@ -799,6 +801,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xc9854d:    // 4d 85 c9 : test r9, r9
     case 0xc98b4c:    // 4c 8b c9 : mov r9, rcx
     case 0xd12948:    // 48 29 d1 : sub rcx, rdx
+    case 0xc22b4c:    // 4c 2b c2 : sub r8, rdx
     case 0xca2b48:    // 48 2b ca : sub rcx, rdx
     case 0xca3b48:    // 48 3b ca : cmp rcx, rdx
     case 0xd12b48:    // 48 2b d1 : sub rdx, rcx
@@ -813,6 +816,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xd9f748:    // 48 f7 d9 : neg rcx
     case 0xc03145:    // 45 31 c0 : xor r8d,r8d
     case 0xc93145:    // 45 31 c9 : xor r9d,r9d
+    case 0xd23345:    // 45 33 d2 : xor r10d, r10d
     case 0xdb3345:    // 45 33 db : xor r11d, r11d
     case 0xc08445:    // 45 84 c0 : test r8b,r8b
     case 0xd28445:    // 45 84 d2 : test r10b,r10b
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index f11c1d1458556..e3dc4cfbe9b2c 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -841,6 +841,7 @@ const struct InstructionSizeData {
     { 1, {0xCC}, 0, "CC : int 3  i.e. registering weak functions)"},
     { 2, {0x31, 0xC0}, 0, "31 C0 : xor eax, eax"},
     { 2, {0x31, 0xC9}, 0, "31 C9 : xor ecx, ecx"},
+    { 2, {0x33, 0xFF}, 0, "33 FF : xor edi, edi"},
     { 2, {0x31, 0xD2}, 0, "31 D2 : xor edx, edx"},
     { 2, {0x33, 0xC0}, 0, "33 C0 : xor eax, eax"},
     { 2, {0x33, 0xC9}, 0, "33 C9 : xor ecx, ecx"},
@@ -895,6 +896,7 @@ const struct InstructionSizeData {
     { 3, {0x0f, 0xb6, 0x11}, 0, "0f b6 11 : movzx edx, BYTE PTR [rcx]"},
     { 3, {0x0f, 0xb6, 0xc2}, 0, "0f b6 c2 : movzx eax, dl"},
     { 3, {0x0f, 0xb6, 0xd2}, 0, "0f b6 d2 : movzx edx, dl"},
+    { 3, {0x0f, 0xb7, 0x02}, 0, "0f b7 02 : movzx eax, WORD PTR [rdx]"},
     { 3, {0x0f, 0xb7, 0x10}, 0, "0f b7 10 : movzx edx, WORD PTR [rax]"},
     { 3, {0x0f, 0xbe, 0xd2}, 0, "0f be d2 : movsx edx, dl"},
     { 3, {0x41, 0x8b, 0xc0}, 0, "41 8b c0 : mov eax, r8d"},
@@ -906,6 +908,7 @@ const struct InstructionSizeData {
     { 3, {0x45, 0x31, 0xc9}, 0, "45 31 c9 : xor r9d,r9d"},
     { 3, {0x45, 0x33, 0xc0}, 0, "45 33 c0 : xor r8d, r8d"},
     { 3, {0x45, 0x33, 0xc9}, 0, "45 33 c9 : xor r9d, r9d"},
+    { 3, {0x45, 0x33, 0xd2}, 0, "45 33 d2 : xor r10d, r10d"},
     { 3, {0x45, 0x33, 0xdb}, 0, "45 33 db : xor r11d, r11d"},
     { 3, {0x45, 0x84, 0xc0}, 0, "45 84 c0 : test r8b,r8b"},
     { 3, {0x45, 0x84, 0xd2}, 0, "45 84 d2 : test r10b,r10b"},
@@ -950,6 +953,7 @@ const struct InstructionSizeData {
     { 3, {0x49, 0xff, 0xc5}, 0, "49 ff c5 : inc r13"},
     { 3, {0x49, 0xff, 0xc6}, 0, "49 ff c6 : inc r14"},
     { 3, {0x49, 0xff, 0xc7}, 0, "49 ff c7 : inc r15"},
+    { 3, {0x4c, 0x2b, 0xc2}, 0, "4c 2b c2 : sub r8, rdx"},
     { 3, {0x4c, 0x8b, 0xc1}, 0, "4c 8b c1 : mov r8, rcx"},
     { 3, {0x4c, 0x8b, 0xc9}, 0, "4c 8b c9 : mov r9, rcx"},
     { 3, {0x4c, 0x8b, 0xd1}, 0, "4c 8b d1 : mov r10, rcx"},
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 9d2f02189b8bd..3f7dd8e402b78 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -195,16 +195,14 @@ def push_dynamic_library_lookup_path(config, new_path):
 # Normalize the path for comparison
 if test_cc_resource_dir is not None:
     test_cc_resource_dir = os.path.realpath(test_cc_resource_dir)
-if lit_config.debug:
-    lit_config.note(f"Resource dir for {config.clang} is {test_cc_resource_dir}")
+lit_config.dbg(f"Resource dir for {config.clang} is {test_cc_resource_dir}")
 local_build_resource_dir = os.path.realpath(config.compiler_rt_output_dir)
 if test_cc_resource_dir != local_build_resource_dir and config.test_standalone_build_libs:
     if config.compiler_id == "Clang":
-        if lit_config.debug:
-            lit_config.note(
-                f"Overriding test compiler resource dir to use "
-                f'libraries in "{config.compiler_rt_libdir}"'
-            )
+        lit_config.dbg(
+            f"Overriding test compiler resource dir to use "
+            f'libraries in "{config.compiler_rt_libdir}"'
+        )
         # Ensure that we use the just-built static libraries when linking by
         # overriding the Clang resource directory. Additionally, we want to use
         # the builtin headers shipped with clang (e.g. stdint.h), so we
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 3197b3d7fd01b..54e41ece5f4d9 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -83,7 +83,7 @@ add_unittest_framework_library(
 )
 
 set(libc_death_test_srcs LibcDeathTestExecutors.cpp)
-if(${LIBC_TARGET_OS} STREQUAL "linux")
+if(${LIBC_TARGET_OS} STREQUAL "linux" OR ${LIBC_TARGET_OS} STREQUAL "darwin")
   list(APPEND libc_death_test_srcs ExecuteFunctionUnix.cpp)
 endif()
 
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc
index 14a09b1f09f5c..75561430b33ad 100644
--- a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc
@@ -21,47 +21,50 @@
 
 #ifdef __CLC_HAS_ATOMIC
 
-#ifndef __CLC_PTR_CASTTYPE
-#define __CLC_PTR_CASTTYPE __CLC_GENTYPE
+#ifndef __CLC_CASTTYPE
+#define __CLC_CASTTYPE __CLC_GENTYPE
 #endif
 
 #ifndef __CLC_AS_RETTYPE
 #define __CLC_AS_RETTYPE(x) x
 #endif
 
+#ifndef __CLC_AS_CASTTYPE
+#define __CLC_AS_CASTTYPE(x) x
+#endif
+
 #ifdef __CLC_NO_VALUE_ARG
 #define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
   _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(                         \
       volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder,                  \
       int MemoryScope) {                                                       \
     return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION(                               \
-        (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope));       \
+        (ADDRSPACE __CLC_CASTTYPE *)Ptr, MemoryOrder, MemoryScope));           \
   }
 #elif defined(__CLC_INC_DEC)
 #define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
   _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(                         \
       volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder,                  \
       int MemoryScope) {                                                       \
-    return __CLC_AS_RETTYPE(                                                   \
-        __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr,               \
-                            (__CLC_GENTYPE)1, MemoryOrder, MemoryScope));      \
+    return __CLC_IMPL_FUNCTION(Ptr, (__CLC_GENTYPE)1, MemoryOrder,             \
+                               MemoryScope);                                   \
   }
 #elif defined(__CLC_RETURN_VOID)
 #define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
   _CLC_OVERLOAD _CLC_DEF void __CLC_FUNCTION(                                  \
       volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value,              \
       int MemoryOrder, int MemoryScope) {                                      \
-    __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value,            \
-                        MemoryOrder, MemoryScope);                             \
+    __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_CASTTYPE *)Ptr,                       \
+                        __CLC_AS_CASTTYPE(Value), MemoryOrder, MemoryScope);   \
   }
 #else
 #define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
   _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(                         \
       volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value,              \
       int MemoryOrder, int MemoryScope) {                                      \
-    return __CLC_AS_RETTYPE(                                                   \
-        __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value,        \
-                            MemoryOrder, MemoryScope));                        \
+    return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION(                               \
+        (ADDRSPACE __CLC_CASTTYPE *)Ptr, __CLC_AS_CASTTYPE(Value),             \
+        MemoryOrder, MemoryScope));                                            \
   }
 #endif
 
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
index ee80256d3dbb6..b2c26758103cd 100644
--- a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl
@@ -14,10 +14,12 @@
 #define __CLC_BODY <clc_atomic_def.inc>
 #include <clc/integer/gentype.inc>
 
-#undef __CLC_PTR_CASTTYPE
+#undef __CLC_CASTTYPE
 #undef __CLC_AS_RETTYPE
-#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN
+#undef __CLC_AS_CASTTYPE
+#define __CLC_CASTTYPE __CLC_BIT_INTN
 #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x)
+#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE
 
 #define __CLC_BODY <clc_atomic_def.inc>
 #include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl
index f7fe2510569e4..af808553a7110 100644
--- a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl
@@ -15,9 +15,9 @@
 #define __CLC_BODY <clc_atomic_def.inc>
 #include <clc/integer/gentype.inc>
 
-#undef __CLC_PTR_CASTTYPE
+#undef __CLC_CASTTYPE
 #undef __CLC_AS_RETTYPE
-#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN
+#define __CLC_CASTTYPE __CLC_BIT_INTN
 #define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x)
 
 #define __CLC_BODY <clc_atomic_def.inc>
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl
index a93d21e8430ce..66ae2ba98556d 100644
--- a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl
@@ -15,8 +15,10 @@
 #define __CLC_BODY <clc_atomic_def.inc>
 #include <clc/integer/gentype.inc>
 
-#undef __CLC_PTR_CASTTYPE
-#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN
+#undef __CLC_CASTTYPE
+#undef __CLC_AS_CASTTYPE
+#define __CLC_CASTTYPE __CLC_BIT_INTN
+#define __CLC_AS_CASTTYPE __CLC_AS_S_GENTYPE
 
 #define __CLC_BODY <clc_atomic_def.inc>
 #include <clc/math/gentype.inc>
diff --git a/libcxx/test/selftest/dsl/dsl.sh.py b/libcxx/test/selftest/dsl/dsl.sh.py
index 93f351f58eb4b..b8ee2ca3d6bb9 100644
--- a/libcxx/test/selftest/dsl/dsl.sh.py
+++ b/libcxx/test/selftest/dsl/dsl.sh.py
@@ -61,7 +61,7 @@ def setUp(self):
         self.litConfig = lit.LitConfig.LitConfig(
             progname="lit",
             path=[],
-            quiet=False,
+            diagnostic_level="note",
             useValgrind=False,
             valgrindLeakCheck=False,
             valgrindArgs=[],
diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py
index 0840c46d7bfae..00fab6a73ba68 100644
--- a/libcxx/utils/libcxx/test/config.py
+++ b/libcxx/utils/libcxx/test/config.py
@@ -22,6 +22,7 @@ def _appendToSubstitution(substitutions, key, value):
 
 def configure(parameters, features, config, lit_config):
     note = lambda s: lit_config.note("({}) {}".format(config.name, s))
+    debug = lambda s: lit_config.dbg("({}) {}".format(config.name, s))
     config.environment = dict(os.environ)
 
     # Apply the actions supplied by parameters to the configuration first, since
@@ -31,25 +32,23 @@ def configure(parameters, features, config, lit_config):
         actions = param.getActions(config, lit_config.params)
         for action in actions:
             action.applyTo(config)
-            if lit_config.debug:
-                note(
-                    "Applied '{}' as a result of parameter '{}'".format(
-                        action.pretty(config, lit_config.params),
-                        param.pretty(config, lit_config.params),
-                    )
+            debug(
+                "Applied '{}' as a result of parameter '{}'".format(
+                    action.pretty(config, lit_config.params),
+                    param.pretty(config, lit_config.params),
                 )
+            )
 
     # Then, apply the automatically-detected features.
     for feature in features:
         actions = feature.getActions(config)
         for action in actions:
             action.applyTo(config)
-            if lit_config.debug:
-                note(
-                    "Applied '{}' as a result of implicitly detected feature '{}'".format(
-                        action.pretty(config, lit_config.params), feature.pretty(config)
-                    )
+            debug(
+                "Applied '{}' as a result of implicitly detected feature '{}'".format(
+                    action.pretty(config, lit_config.params), feature.pretty(config)
                 )
+            )
 
     # Print the basic substitutions
     for sub in ("%{cxx}", "%{flags}", "%{compile_flags}", "%{link_flags}", "%{benchmark_flags}", "%{exec}"):
diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py
index 3fb30d82e0d24..88fc49160c56b 100644
--- a/libcxx/utils/libcxx/test/dsl.py
+++ b/libcxx/utils/libcxx/test/dsl.py
@@ -88,7 +88,7 @@ def _executeWithFakeConfig(test, commands):
     litConfig = lit.LitConfig.LitConfig(
         progname="lit",
         path=[],
-        quiet=False,
+        diagnostic_level="note",
         useValgrind=False,
         valgrindLeakCheck=False,
         valgrindArgs=[],
diff --git a/lldb/source/Commands/CMakeLists.txt b/lldb/source/Commands/CMakeLists.txt
index 69e4c45f0b8e5..33332f2d59a23 100644
--- a/lldb/source/Commands/CMakeLists.txt
+++ b/lldb/source/Commands/CMakeLists.txt
@@ -58,6 +58,8 @@ add_lldb_library(lldbCommands NO_PLUGIN_DEPENDENCIES
     lldbUtility
     lldbValueObject
     lldbVersion
+  CLANG_LIBS
+    clangFrontend
   )
 
 add_dependencies(lldbCommands LLDBOptionsGen)
diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index f27a95de484df..815cc9dada2c1 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -52,9 +52,6 @@
 #include "lldb/Host/windows/PosixApi.h"
 #endif
 
-#include "Plugins/Language/CPlusPlus/CPlusPlusLanguage.h"
-#include "Plugins/Language/ObjC/ObjCLanguage.h"
-
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DJB.h"
diff --git a/lldb/source/ValueObject/CMakeLists.txt b/lldb/source/ValueObject/CMakeLists.txt
index 2a61407521bec..f0fe7f374a506 100644
--- a/lldb/source/ValueObject/CMakeLists.txt
+++ b/lldb/source/ValueObject/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_lldb_library(lldbValueObject
+add_lldb_library(lldbValueObject NO_PLUGIN_DEPENDENCIES
   DILAST.cpp
   DILEval.cpp
   DILLexer.cpp
@@ -34,6 +34,4 @@ add_lldb_library(lldbValueObject
     lldbSymbol
     lldbTarget
     lldbUtility
-    lldbPluginCPlusPlusLanguage
-    lldbPluginObjCLanguage
   )
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3418df1dede11..22075c8e73b3f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1017,6 +1017,15 @@ supported for the ``amdgcn`` target.
   `ptr addrspace(7)` directly, which produces a buffer fat pointer with an initial
   offset of 0 and prevents the address space cast from being rewritten away.
 
+  The ``align`` attribute on operations from buffer fat pointers is deemed to apply
+  to all componenents of the pointer - that is, an ``align 4`` load is expected to
+  both have the offset be a multiple of 4 and to have a base pointer with an
+  alignment of 4.
+
+  This componentwise definition of alignment is needed to allow for promotion of
+  aligned loads to ``s_buffer_load``, which requires that both the base pointer and
+  offset be appropriately aligned.
+
 **Buffer Resource**
   The buffer resource pointer, in address space 8, is the newer form
   for representing buffer descriptors in AMDGPU IR, replacing their
@@ -1040,6 +1049,25 @@ supported for the ``amdgcn`` target.
   (bits `127:96`). The specific interpretation of these fields varies by the
   target architecture and is detailed in the ISA descriptions.
 
+  When buffer resources are passed to buffer intrinsics such as
+  ``llvm.amdgcn.raw.ptr.buffer.load`` or
+  ``llvm.amdgcn.struct.ptr.buffer.store``, the ``align`` attribute on the
+  pointer is assumed to apply to both the offset and the base pointer value.
+  That is, ``align 8`` means that both the base address within the ``ptr
+  addrspace(8)`` and the ``offset`` argument have their three lowest bits set
+  to 0. If the stride of the resource is nonzero, the stride must be a multiple
+  of the given alignment.
+
+  In other words, the ``align`` attribute specifies the alignment of the effective
+  address being loaded from/stored to *and* acts as a guarantee that this is
+  not achieved from adding lower-alignment parts (as hardware may not always
+  allow for such an addition). For example, if a buffer resource has the base
+  address ``0xfffe`` and is accessed with a ``raw.ptr.buffer.load`` with an offset
+  of ``2``, the load must **not** be marked ``align 4`` (even though the
+  effective adddress ``0x10000`` is so aligned) as this would permit the compiler
+  to make incorrect transformations (such as promotion to ``s_buffer_load``,
+  which requires such componentwise alignment).
+
 **Buffer Strided Pointer**
   The buffer index pointer is an experimental address space. It represents
   a 128-bit buffer descriptor and a 32-bit offset, like the **Buffer Fat
@@ -1058,6 +1086,12 @@ supported for the ``amdgcn`` target.
   index and offset values are both 0. This prevents the address space cast from
   being rewritten away.
 
+  As with buffer fat pointers, alignment of a buffer strided pointer applies to
+  both the base pointer address and the offset. In addition, the alignment also
+  constrains the stride of the pointer. That is, if you do an ``align 4`` load from
+  a buffer strided pointer, this means that the base pointer is ``align(4)``, that
+  the offset is a multiple of 4 bytes, and that the stride is a multiple of 4.
+
 **Streamout Registers**
   Dedicated registers used by the GS NGG Streamout Instructions. The register
   file is modelled as a memory in a distinct address space because it is indexed
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index e57ed24a45065..2ebccee6aa68c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -628,10 +628,15 @@ void CodeViewDebug::beginModule(Module *M) {
     // When emitting only compiler information, we may have only NoDebug CUs,
     // which would be skipped by debug_compile_units_begin.
     NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+    if (CUs->operands().empty()) {
+      Asm = nullptr;
+      return;
+    }
     Node = *CUs->operands().begin();
   }
-  const auto *CU = cast<DICompileUnit>(Node);
-  DISourceLanguageName Lang = CU->getSourceLanguage();
+
+  TheCU = cast<DICompileUnit>(Node);
+  DISourceLanguageName Lang = TheCU->getSourceLanguage();
   CurrentSourceLanguage =
       Lang.hasVersionedName()
           ? MapDWARFLanguageToCVLang(
@@ -639,7 +644,7 @@ void CodeViewDebug::beginModule(Module *M) {
           : MapDWARFLanguageToCVLang(
                 static_cast<dwarf::SourceLanguage>(Lang.getName()));
   if (!M->getCodeViewFlag() ||
-      CU->getEmissionKind() == DICompileUnit::NoDebug) {
+      TheCU->getEmissionKind() == DICompileUnit::NoDebug) {
     Asm = nullptr;
     return;
   }
@@ -900,11 +905,10 @@ void CodeViewDebug::emitCompilerInformation() {
   OS.AddComment("CPUType");
   OS.emitInt16(static_cast<uint64_t>(TheCPU));
 
-  NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
-  const MDNode *Node = *CUs->operands().begin();
-  const auto *CU = cast<DICompileUnit>(Node);
+  StringRef CompilerVersion = "0";
+  if (TheCU)
+    CompilerVersion = TheCU->getProducer();
 
-  StringRef CompilerVersion = CU->getProducer();
   Version FrontVer = parseVersion(CompilerVersion);
   OS.AddComment("Frontend version");
   for (int N : FrontVer.Part) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index c2b878e52e1c3..7fd2cec8c74f2 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -98,6 +98,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   /// The codeview CPU type used by the translation unit.
   codeview::CPUType TheCPU;
 
+  const DICompileUnit *TheCU = nullptr;
+
   /// The AsmPrinter used for emitting compiler metadata. When only compiler
   /// info is being emitted, DebugHandlerBase::Asm may be null.
   AsmPrinter *CompilerInfoAsm = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5160cc999d63b..d78c4c707d168 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -2218,7 +2218,7 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
   let Inst{23-16} = LdScaleOp;
   let Inst{40-32} = scale_src0;
   let Inst{49-41} = scale_src1;
-  let Inst{58-50} = 0; // scale src2
+  let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
   let Inst{59}    = matrix_b_scale{0}; // scale_op_sel_hi(0)
   let Inst{60}    = 0;                 // scale_op_sel_hi(1)
   let Inst{63-61} = {0, matrix_a_scale_fmt{1-0}}; // neg (lo)
@@ -2433,6 +2433,15 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
                           string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
   VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>;
 
+multiclass VOP3P_Real_LD_SCALE_gfx1250<bits<8> op> {
+  defvar ps = !cast<VOP3P_Pseudo>(NAME);
+  def _gfx1250 :
+    VOP3P_Real_Gen<ps, GFX1250Gen, ps.Mnemonic>,
+    VOP3Pe_gfx11_gfx12<op, ps.Pfl> {
+      let Inst{58-50} = 0x100; // scale src2 = vgpr0 (dummy)
+    }
+}
+
 defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
 defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
 
@@ -2462,8 +2471,8 @@ defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple<GFX1250Gen, 0x3d>;
 defm V_FMA_MIXLO_BF16   : VOP3P_Realtriple<GFX1250Gen, 0x3e>;
 defm V_FMA_MIXHI_BF16   : VOP3P_Realtriple<GFX1250Gen, 0x3f>;
 
-defm V_WMMA_LD_SCALE_PAIRED_B32   : VOP3P_Real_gfx1250<0x35>;
-defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_gfx1250<0x3a>;
+defm V_WMMA_LD_SCALE_PAIRED_B32   : VOP3P_Real_LD_SCALE_gfx1250<0x35>;
+defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3P_Real_LD_SCALE_gfx1250<0x3a>;
 
 let AssemblerPredicate = isGFX1250Plus in
 def : AMDGPUMnemonicAlias<"v_fma_mix_f32_f16",  "v_fma_mix_f32">;
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 21a113572ce93..c19eed122fe63 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -2536,7 +2536,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   (void)TRI;  // Silence unused warning in non-assert builds.
-  Register FramePtr = RegInfo->getFrameRegister(MF);
+  Register FramePtr = STI.getFramePointerReg();
   ARMSubtarget::PushPopSplitVariation PushPopSplit =
       STI.getPushPopSplitVariation(MF);
 
@@ -2783,7 +2783,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
       !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
 
-    if (HasFP) {
+    // Save the FP if:
+    // 1. We currently need it (HasFP), OR
+    // 2. We might need it later due to stack realignment from aligned DPRCS2
+    //    saves (which will make hasFP() become true in emitPrologue).
+    if (HasFP || (isFPReserved(MF) && AFI->getNumAlignedDPRCS2Regs() > 0)) {
       SavedRegs.set(FramePtr);
       // If the frame pointer is required by the ABI, also spill LR so that we
       // emit a complete frame record.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 92fae71121a81..f28640ce7b107 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2510,9 +2510,44 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
     if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
       Chain = DAG.getStackArgumentTokenFactor(Chain);
-      if (ByValTempChain)
+      if (ByValTempChain) {
+        // In case of large byval copies, re-using the stackframe for tail-calls
+        // can lead to overwriting incoming arguments on the stack. Force
+        // loading these stack arguments before the copy to avoid that.
+        SmallVector<SDValue, 8> IncomingLoad;
+        for (unsigned I = 0; I < OutVals.size(); ++I) {
+          if (Outs[I].Flags.isByVal())
+            continue;
+
+          SDValue OutVal = OutVals[I];
+          LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
+          if (!OutLN)
+            continue;
+
+          FrameIndexSDNode *FIN =
+              dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr());
+          if (!FIN)
+            continue;
+
+          if (!MFI.isFixedObjectIndex(FIN->getIndex()))
+            continue;
+
+          for (const CCValAssign &VA : ArgLocs) {
+            if (VA.isMemLoc())
+              IncomingLoad.push_back(OutVal.getValue(1));
+          }
+        }
+
+        // Update the chain to force loads for potentially clobbered argument
+        // loads to happen before the byval copy.
+        if (!IncomingLoad.empty()) {
+          IncomingLoad.push_back(Chain);
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
+        }
+
         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
                             ByValTempChain);
+      }
       AfterFormalArgLoads = true;
     }
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 1cbedb7d141e2..1024e55f912c7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -2691,6 +2691,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case ISD::SCALAR_TO_VECTOR:
+    if (Subtarget->enablePExtCodeGen()) {
+      MVT SrcVT = Node->getOperand(0).getSimpleValueType();
+      if (VT == MVT::v2i32 && SrcVT == MVT::i64) {
+        ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
+        CurDAG->RemoveDeadNode(Node);
+        return;
+      }
+    }
+    break;
   case ISD::INSERT_SUBVECTOR:
   case RISCVISD::TUPLE_INSERT: {
     SDValue V = Node->getOperand(0);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5a081d54d0726..d086a2a4a3057 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -516,8 +516,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
       setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
       setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
-      setOperationAction(ISD::LOAD, MVT::v2i16, Custom);
-      setOperationAction(ISD::LOAD, MVT::v4i8, Custom);
     } else {
       VTs.append({MVT::v2i16, MVT::v4i8});
     }
@@ -14757,21 +14755,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) {
-      SDLoc DL(N);
-      SDValue ExtLoad =
-          DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(),
-                         Ld->getBasePtr(), MVT::i32, Ld->getMemOperand());
-      if (N->getValueType(0) == MVT::v2i16) {
-        Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad));
-        Results.push_back(ExtLoad.getValue(1));
-      } else if (N->getValueType(0) == MVT::v4i8) {
-        Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad));
-        Results.push_back(ExtLoad.getValue(1));
-      }
-      return;
-    }
-
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b9d4ff41c0755..835b0995cc4fc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7633,6 +7633,14 @@ createWidenInductionRecipes(VPInstruction *PhiR,
 
   VPValue *Step =
       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep());
+
+  // Update wide induction increments to use the same step as the corresponding
+  // wide induction. This enables detecting induction increments directly in
+  // VPlan and removes redundant splats.
+  using namespace llvm::VPlanPatternMatch;
+  if (match(PhiR->getOperand(1), m_Add(m_Specific(PhiR), m_VPValue())))
+    PhiR->getOperand(1)->getDefiningRecipe()->setOperand(1, Step);
+
   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingInstr());
   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                            IndDesc, PhiR->getDebugLoc());
@@ -8473,20 +8481,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
          "entry block must be set to a VPRegionBlock having a non-empty entry "
          "VPBasicBlock");
 
-  // Update wide induction increments to use the same step as the corresponding
-  // wide induction. This enables detecting induction increments directly in
-  // VPlan and removes redundant splats.
-  for (const auto &[Phi, ID] : Legal->getInductionVars()) {
-    auto *IVInc = cast<Instruction>(
-        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    if (IVInc->getOperand(0) != Phi || IVInc->getOpcode() != Instruction::Add)
-      continue;
-    VPWidenInductionRecipe *WideIV =
-        cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
-    VPRecipeBase *R = RecipeBuilder.getRecipe(IVInc);
-    R->setOperand(1, WideIV->getStepValue());
-  }
-
   // TODO: We can't call runPass on these transforms yet, due to verifier
   // failures.
   VPlanTransforms::addExitUsersForFirstOrderRecurrences(*Plan, Range);
@@ -8627,6 +8621,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
 void LoopVectorizationPlanner::adjustRecipesForReductions(
     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
   using namespace VPlanPatternMatch;
+  VPTypeAnalysis TypeInfo(*Plan);
   VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
   VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
   VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
@@ -8711,8 +8706,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
         LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
         VecOp = FMulRecipe;
       } else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
-                 CurrentLinkI->getOpcode() == Instruction::Sub) {
-        Type *PhiTy = PhiR->getUnderlyingValue()->getType();
+                 match(CurrentLink, m_Sub(m_VPValue(), m_VPValue()))) {
+        Type *PhiTy = TypeInfo.inferScalarType(PhiR);
         auto *Zero = Plan->getConstantInt(PhiTy, 0);
         VPWidenRecipe *Sub = new VPWidenRecipe(
             Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
@@ -8788,7 +8783,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
 
     const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
         cast<PHINode>(PhiR->getUnderlyingInstr()));
-    Type *PhiTy = PhiR->getUnderlyingValue()->getType();
+    Type *PhiTy = TypeInfo.inferScalarType(PhiR);
     // If tail is folded by masking, introduce selects between the phi
     // and the users outside the vector region of each reduction, at the
     // beginning of the dedicated latch block.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index aa2785252d376..f34c99b84b1aa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -496,6 +496,12 @@ m_c_Binary(const Op0_t &Op0, const Op1_t &Op1) {
   return AllRecipe_commutative_match<Opcode, Op0_t, Op1_t>(Op0, Op1);
 }
 
+template <typename Op0_t, typename Op1_t>
+inline AllRecipe_match<Instruction::Add, Op0_t, Op1_t> m_Add(const Op0_t &Op0,
+                                                             const Op1_t &Op1) {
+  return m_Binary<Instruction::Add, Op0_t, Op1_t>(Op0, Op1);
+}
+
 template <typename Op0_t, typename Op1_t>
 inline AllRecipe_commutative_match<Instruction::Add, Op0_t, Op1_t>
 m_c_Add(const Op0_t &Op0, const Op1_t &Op1) {
diff --git a/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll
new file mode 100644
index 0000000000000..50c676c425ce7
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/byval_struct_copy_tailcall.ll
@@ -0,0 +1,69 @@
+; RUN: llc -mtriple thumbv7em-apple-darwin -o - < %s | FileCheck %s
+
+%"struct.s1" = type { [19 x i32] }
+
+define void @f0(ptr byval(%"struct.s1") %0, ptr %1) #1 {
+; CHECK-LABEL: _f0:                                    @ @f0
+; CHECK-NEXT:  @ %bb.0:
+; CHECK-NEXT:  	sub	sp, #16
+; CHECK-NEXT:  	push	{r4, lr}
+; CHECK-NEXT:  	sub	sp, #76
+; CHECK-NEXT:  	add.w	r9, sp, #84
+; CHECK-NEXT:  	stm.w	r9, {r0, r1, r2, r3}
+; CHECK-NEXT:  	mov	r0, sp
+; CHECK-NEXT:  	add	r1, sp, #84
+; CHECK-NEXT:  	movs	r2, #76
+; CHECK-NEXT:  	mov	r3, r0
+; CHECK-NEXT:  LBB0_1:                                 @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:  	ldr	r4, [r1], #4
+; CHECK-NEXT:  	subs	r2, #4
+; CHECK-NEXT:  	str	r4, [r3], #4
+; CHECK-NEXT:  	bne	LBB0_1
+; CHECK-NEXT:  @ %bb.2:
+; CHECK-NEXT:  	add.w	r1, r0, #12
+; CHECK-NEXT:  	add	r2, sp, #100
+; CHECK-NEXT:  	ldr	r0, [sp, #160]
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldr	r3, [r1], #4
+; CHECK-NEXT:  	str	r3, [r2], #4
+; CHECK-NEXT:  	ldm.w	sp, {r1, r2, r3}
+; CHECK-NEXT:  	add	sp, #76
+; CHECK-NEXT:  	pop.w	{r4, lr}
+; CHECK-NEXT:  	add	sp, #16
+; CHECK-NEXT:  	b.w	_f1
+  tail call  void @f1(ptr %1, ptr byval(%"struct.s1") %0)
+  ret void
+}
+
+declare void @f1(ptr, ptr)
+
+attributes #1 = { nounwind "frame-pointes"="non-leaf" }
diff --git a/llvm/test/CodeGen/ARM/save-fp-with-non-leaf.ll b/llvm/test/CodeGen/ARM/save-fp-with-non-leaf.ll
new file mode 100644
index 0000000000000..fefa5a0a68020
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/save-fp-with-non-leaf.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK
+target datalayout = "e-m:o-p:32:32-Fi8-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-darwin"
+
+; This test checks that even with NEON register induced stack re-alignment, and
+; with the frame-pointer=non-leaf option, that we still save fp aka r7 in the
+; prolog as required.
+
+define fastcc i32 @test_save_fp() #0 {
+; CHECK-LABEL: test_save_fp:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, r7, lr}
+; CHECK-NEXT:    add r7, sp, #4
+; CHECK-NEXT:    sub.w r4, sp, #64
+; CHECK-NEXT:    bfc r4, #0, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    vst1.64 {d8, d9, d10, d11}, [r4:128]!
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vst1.64 {d12, d13, d14, d15}, [r4:128]
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    @ InlineAsm Start
+; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]
+; CHECK-NEXT:    vld1.16 {d4, d5, d6, d7}, [r0]
+; CHECK-NEXT:    vabdl.s16 q4, d0, d4
+; CHECK-EMPTY:
+; CHECK-NEXT:    @ InlineAsm End
+; CHECK-NEXT:    vld1.64 {d8, d9, d10, d11}, [r4:128]!
+; CHECK-NEXT:    vld1.64 {d12, d13, d14, d15}, [r4:128]
+; CHECK-NEXT:    subs r4, r7, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r7, pc}
+  tail call void asm sideeffect "vld1.i16 {q0,q1}, [$0]\0Avld1.i16 {q2,q3}, [$1]\0Avabdl.s16 q4, d0, d4\0A", "r,r,r,~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{memory}"(ptr null, ptr null, ptr null)
+  ret i32 0
+}
+
+attributes #0 = { "frame-pointer"="non-leaf" }
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
index d652a540f3e9c..d17182131168c 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
@@ -5,9 +5,9 @@
 ; RUN: echo 'v1' > %t
 ; RUN: echo 'f _Z3foob' >> %t
 ;;
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t  > %bbsections
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig
-; RUN: diff -u %orig %bbsections
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t  > %t.bbsections
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %t.orig
+; RUN: diff -u %t.orig %t.bbsections
 
 define i32 @_Z3foob(i1 zeroext %0) nounwind {
   %2 = alloca i32, align 4
diff --git a/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll
new file mode 100644
index 0000000000000..51435b10fdc2a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/codeview-empty-dbg-cu-crash.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck %s
+
+; CHECK: .file	"<stdin>"
+; CHECK-NEXT: .section	.debug$S,"dr"
+; CHECK-NEXT: .p2align	2, 0x0
+; CHECK-NEXT: .long	4                               # Debug section magic
+; CHECK-NEXT: .long	241
+; CHECK-NEXT: .long	.Ltmp1-.Ltmp0                   # Subsection size
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .short	.Ltmp3-.Ltmp2                   # Record length
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .short	4353                            # Record kind: S_OBJNAME
+; CHECK-NEXT: .long	0                               # Signature
+; CHECK-NEXT: .byte	0                               # Object name
+; CHECK-NEXT: .p2align	2, 0x0
+; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: .short	.Ltmp5-.Ltmp4                   # Record length
+; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: .short	4412                            # Record kind: S_COMPILE3
+; CHECK-NEXT: .long	3                               # Flags and language
+; CHECK-NEXT: .short	208                             # CPUType
+; CHECK-NEXT: .short	0                               # Frontend version
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .short	22000                           # Backend version
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .short	0
+; CHECK-NEXT: .asciz	"0"                             # Null-terminated compiler version string
+; CHECK-NEXT: .p2align	2, 0x0
+; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .p2align	2, 0x0
+
+!llvm.dbg.cu = !{}
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
index 8185b77beb935..fcfff9ac5b63d 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_wmma_w32.s
@@ -4,1906 +4,1906 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x00,0x5d,0xcc,0x00,0x05,0x12,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x04,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_a_reuse ; encoding: [0x04,0x20,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse ; encoding: [0x04,0x40,0x5d,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x62,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x62,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x62,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x62,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x63,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x63,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x63,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x63,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x1a,0x01,0x64,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x1a,0x02,0x64,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x1a,0x00,0x64,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x1a,0x04,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x1a,0x20,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x1a,0x40,0x64,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6a,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6a,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6b,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6b,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6c,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6c,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6d,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x6d,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6e,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6e,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x6f,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x6f,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x70,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x70,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x71,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x71,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], 1 ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x06,0x1a]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x72,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x60,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x60,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x60,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x60,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x10,0x01,0x61,0xcc,0x00,0x11,0x42,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x10,0x02,0x61,0xcc,0x00,0x11,0x42,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x61,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x61,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x66,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x66,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x68,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x68,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x68,0xcc,0x00,0x11,0x72,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x68,0xcc,0x00,0x11,0x72,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x66,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x69,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x69,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x69,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x69,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x69,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x69,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x73,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x73,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x73,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x73,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x74,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x74,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x74,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x74,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x75,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x75,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x75,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x75,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x76,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x76,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x76,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x76,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x77,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x77,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x77,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x77,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x78,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x78,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x78,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x78,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x79,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x79,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x79,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x79,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] ; encoding: [0x18,0x00,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1 ; encoding: [0x18,0x08,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_a_reuse ; encoding: [0x18,0x20,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse ; encoding: [0x18,0x40,0x7a,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1 ; encoding: [0x18,0x08,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0] ; encoding: [0x18,0x00,0x7b,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_a_reuse ; encoding: [0x18,0x20,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse ; encoding: [0x18,0x40,0x7b,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 ; encoding: [0x18,0x00,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1 ; encoding: [0x18,0x08,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x65,0xcc,0x00,0x11,0x82,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x65,0xcc,0x00,0x11,0x82,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_a_reuse ; encoding: [0x18,0x20,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse ; encoding: [0x18,0x40,0x65,0xcc,0x00,0x11,0x82,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 ; encoding: [0x18,0x00,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1 ; encoding: [0x18,0x08,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0] ; encoding: [0x18,0x01,0x67,0xcc,0x00,0x11,0x72,0x3c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0] ; encoding: [0x18,0x02,0x67,0xcc,0x00,0x11,0x72,0x5c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_a_reuse ; encoding: [0x18,0x20,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse ; encoding: [0x18,0x40,0x67,0xcc,0x00,0x11,0x72,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x10,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:19], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x18,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:15], v[24:39], v[40:47] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x20,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x0c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:31], v[40:47] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x40,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x08,0x33,0xcc,0x08,0x31,0xa2,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], 1.0 ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xca,0x03]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_lo:[0,0,1] ; encoding: [0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x84]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47] neg_hi:[0,0,1] ; encoding: [0x00,0x04,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s1, s2
-// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s1, s2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 2, -4
-// GFX1250: v_wmma_ld_scale_paired_b32 2, -4        ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 2, -4        ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse
-// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5]
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 2, -4
-// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4      ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 2, -4      ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse
-// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP8
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP8
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1]
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP8
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP8
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x84,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x84,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x84,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x84,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x85,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x85,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x85,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_fp8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x85,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x86,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x86,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x86,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_fp8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x86,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x87,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_a_reuse ; encoding: [0x10,0x20,0x87,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], v[16:19] matrix_b_reuse ; encoding: [0x10,0x40,0x87,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x80,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x80,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x80,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x80,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x81,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x81,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x81,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_fp8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x81,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x82,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x82,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x82,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_fp8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x82,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], 1.0 neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_lo:[0,0,1] ; encoding: [0x10,0x00,0x83,0xcc,0x00,0x11,0x42,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] neg_hi:[0,0,1] ; encoding: [0x10,0x04,0x83,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_a_reuse ; encoding: [0x10,0x20,0x83,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_16x16x128_bf8_bf8 v[16:23], v[0:15], v[8:23], v[16:23] matrix_b_reuse ; encoding: [0x10,0x40,0x83,0xcc,0x00,0x11,0x42,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x1b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], 1.0 neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0xca,0x9b]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x88,0xcc,0x00,0x05,0x12,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
 // GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW0
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
 
 v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E8 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1]
-// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
-// WAVESIZE-ERR: :[[@LINE-2]]:1: error: instruction requires wavesize=32
-// GFX12-ERR: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// GFX12-ERR: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+// WAVESIZE-ERR: :[[@LINE-3]]:1: error: instruction requires wavesize=32
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
index a409dac321f83..5d73cbd512edb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_wmma_w32.txt
@@ -586,233 +586,233 @@
 0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c
 # GFX1250: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] ; encoding: [0x10,0x00,0x72,0xcc,0x00,0x11,0x42,0x3c]
 
-0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4      ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x00]
+0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 2, -4      ; encoding: [0x00,0x00,0x3a,0xcc,0x82,0x88,0x01,0x04]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x00]
+0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 s[2:3], s[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x08,0x00,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04]
 
-0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x28]
+0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x3a,0xcc,0x02,0x09,0x02,0x2c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x08]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x0c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x48]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x4c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x28]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x2c]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00
-# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04
+# GFX1250: v_wmma_ld_scale16_paired_b64 v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 2, -4        ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x00]
+0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 2, -4        ; encoding: [0x00,0x00,0x35,0xcc,0x82,0x88,0x01,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 s1, s2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x04,0x00,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2       ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse matrix_b_reuse ; encoding: [0x00,0x6a,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x08]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x48]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x4c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x28]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x2c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00
-# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04
+# GFX1250: v_wmma_ld_scale_paired_b32 v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s[0:1], s[0:1] matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s[0:1], s[0:1] matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s[2:3], s[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v[2:3], v[4:5] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale16_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x18,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:11], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x10,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:11], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP6 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x14]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x08,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_BF8 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x0c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:15], v[0:7], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x00,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:15], v[0:7], v[0:7], s0, s0 matrix_b_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[0:7], v[0:15], v[0:7], s0, s0 matrix_a_fmt:MATRIX_FMT_FP4 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x20,0x33,0xcc,0x00,0x01,0x02,0x04]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], s1, s2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:35], v[40:47], v1, v2 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x0c,0x33,0xcc,0x08,0x31,0xa2,0x94]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
-# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04
+# GFX1250: v_wmma_scale_f32_16x16x128_f8f6f4 v[0:7], v[8:23], v[24:39], v[40:47], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x00,0x33,0xcc,0x08,0x31,0xa2,0x04]
 
 0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b
 # GFX1250: v_wmma_f16_16x16x128_bf8_bf8 v[16:19], v[0:15], v[8:23], 1.0 ; encoding: [0x10,0x00,0x87,0xcc,0x00,0x11,0xca,0x1b]
@@ -1000,92 +1000,92 @@
 0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c
 # GFX1250: v_wmma_f32_32x16x128_f4 v[4:19], v[0:15], v[2:9], v[4:19] neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x88,0xcc,0x00,0x05,0x12,0x9c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_reuse ; encoding: [0x00,0x20,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s0, s0 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x35,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s1, s2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x04,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x35,0xcc,0x01,0x05,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x35,0xcc,0x01,0x05,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v1, v2 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x35,0xcc,0x01,0x05,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_reuse ; encoding: [0x00,0x20,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x08,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_reuse ; encoding: [0x00,0x28,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x00,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x04,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 ; encoding: [0x00,0x00,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x08,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
+0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[0:7], v[0:15], s[0:1], s[0:1] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_b_reuse ; encoding: [0x00,0x40,0x3a,0xcc,0x00,0x00,0x00,0x0c,0x00,0x40,0x88,0xcc,0x08,0x01,0x02,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], s[2:3], s[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x08,0x00,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse matrix_b_reuse neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x68,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x08,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
+0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] ; encoding: [0x00,0x08,0x3a,0xcc,0x02,0x09,0x02,0x0c,0x00,0x44,0x88,0xcc,0x08,0x31,0xa2,0x9c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x40,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x44,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x20,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x00,0x3a,0xcc,0x02,0x09,0x02,0x24,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 ; encoding: [0x00,0x02,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
 
-0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
-# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x00,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
+0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c
+# GFX1250: v_wmma_scale16_f32_32x16x128_f4 v[0:15], v[8:23], v[24:31], v[40:55], v[2:3], v[4:5] matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 ; encoding: [0x00,0x01,0x3a,0xcc,0x02,0x09,0x02,0x04,0x00,0x40,0x88,0xcc,0x08,0x31,0xa2,0x1c]
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index 8cef3c1fd8569..71dad85bbaddd 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 import inspect
 import os
+import enum
 import platform
 import sys
 
@@ -25,7 +26,7 @@ def __init__(
         self,
         progname,
         path,
-        quiet,
+        diagnostic_level,
         useValgrind,
         valgrindLeakCheck,
         valgrindArgs,
@@ -46,7 +47,7 @@ def __init__(
         self.progname = progname
         # The items to add to the PATH environment variable.
         self.path = [str(p) for p in path]
-        self.quiet = bool(quiet)
+        self.diagnostic_level = diagnostic_level
         self.useValgrind = bool(useValgrind)
         self.valgrindLeakCheck = bool(valgrindLeakCheck)
         self.valgrindUserArgs = list(valgrindArgs)
@@ -155,8 +156,7 @@ def per_test_coverage(self, value):
     def load_config(self, config, path):
         """load_config(config, path) - Load a config object from an alternate
         path."""
-        if self.debug:
-            self.note("load_config from %r" % path)
+        self.dbg("load_config from %r" % path)
         config.load_from_path(path, self)
         return config
 
@@ -209,6 +209,8 @@ def getToolsPath(self, dir, paths, tools):
         return dir
 
     def _write_message(self, kind, message):
+        if not self.diagnostic_level_enabled(kind):
+            return
         # Get the file/line where this message was generated.
         f = inspect.currentframe()
         # Step out of _write_message, and then out of wrapper.
@@ -234,13 +236,21 @@ def substitute(self, string):
                 "unable to find %r parameter, use '--param=%s=VALUE'" % (key, key)
             )
 
+    def diagnostic_level_enabled(self, kind):
+        if kind == "debug":
+            return self.debug
+        return DiagnosticLevel.create(self.diagnostic_level) >= DiagnosticLevel.create(
+            kind
+        )
+
+    def dbg(self, message):
+        self._write_message("debug", message)
+
     def note(self, message):
-        if not self.quiet:
-            self._write_message("note", message)
+        self._write_message("note", message)
 
     def warning(self, message):
-        if not self.quiet:
-            self._write_message("warning", message)
+        self._write_message("warning", message)
         self.numWarnings += 1
 
     def error(self, message):
@@ -250,3 +260,25 @@ def error(self, message):
     def fatal(self, message):
         self._write_message("fatal", message)
         sys.exit(2)
+
+
+@enum.unique
+class DiagnosticLevel(enum.IntEnum):
+    FATAL = 0
+    ERROR = 1
+    WARNING = 2
+    NOTE = 3
+
+    @classmethod
+    def create(cls, value):
+        if value == "fatal":
+            return cls.FATAL
+        if value == "error":
+            return cls.ERROR
+        if value == "warning":
+            return cls.WARNING
+        if value == "note":
+            return cls.NOTE
+        raise ValueError(
+            f"invalid diagnostic level {repr(value)} of type {type(value)}"
+        )
diff --git a/llvm/utils/lit/lit/LitTestCase.py b/llvm/utils/lit/lit/LitTestCase.py
index 566d068ad11ea..690b7cb6f13d5 100644
--- a/llvm/utils/lit/lit/LitTestCase.py
+++ b/llvm/utils/lit/lit/LitTestCase.py
@@ -46,7 +46,7 @@ def load_test_suite(inputs):
     lit_config = lit.LitConfig.LitConfig(
         progname="lit",
         path=[],
-        quiet=False,
+        diagnostic_level="note",
         useValgrind=False,
         valgrindLeakCheck=False,
         valgrindArgs=[],
diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py
index c250838250547..e7e545cc8e300 100644
--- a/llvm/utils/lit/lit/TestingConfig.py
+++ b/llvm/utils/lit/lit/TestingConfig.py
@@ -143,8 +143,7 @@ def load_from_path(self, path, litConfig):
         cfg_globals["__file__"] = path
         try:
             exec(compile(data, path, "exec"), cfg_globals, None)
-            if litConfig.debug:
-                litConfig.note("... loaded config %r" % path)
+            litConfig.dbg("... loaded config %r" % path)
         except SystemExit:
             e = sys.exc_info()[1]
             # We allow normal system exit inside a config file to just
diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
index 8238bc42395af..5c2ff4e70a3aa 100644
--- a/llvm/utils/lit/lit/cl_arguments.py
+++ b/llvm/utils/lit/lit/cl_arguments.py
@@ -15,6 +15,59 @@ class TestOrder(enum.Enum):
     SMART = "smart"
 
 
+@enum.unique
+class TestOutputLevel(enum.IntEnum):
+    OFF = 0
+    FAILED = 1
+    ALL = 2
+
+    @classmethod
+    def create(cls, value):
+        if value == "off":
+            return cls.OFF
+        if value == "failed":
+            return cls.FAILED
+        if value == "all":
+            return cls.ALL
+        raise ValueError(f"invalid output level {repr(value)} of type {type(value)}")
+
+
+class TestOutputAction(argparse.Action):
+    def __init__(self, option_strings, dest, **kwargs):
+        super().__init__(option_strings, dest, nargs=None, **kwargs)
+
+    def __call__(self, parser, namespace, value, option_string=None):
+        TestOutputAction.setOutputLevel(namespace, self.dest, value)
+
+    @classmethod
+    def setOutputLevel(cls, namespace, dest, value):
+        setattr(namespace, dest, value)
+        if dest == "test_output" and TestOutputLevel.create(
+            namespace.print_result_after
+        ) < TestOutputLevel.create(value):
+            setattr(namespace, "print_result_after", value)
+        elif dest == "print_result_after" and TestOutputLevel.create(
+            namespace.test_output
+        ) > TestOutputLevel.create(value):
+            setattr(namespace, "test_output", value)
+
+
+class AliasAction(argparse.Action):
+    def __init__(self, option_strings, dest, nargs=None, **kwargs):
+        self.expansion = kwargs.pop("alias", None)
+        if not self.expansion:
+            raise ValueError("no aliases expansion provided")
+        super().__init__(option_strings, dest, nargs=0, **kwargs)
+
+    def __call__(self, parser, namespace, value, option_string=None):
+        for e in self.expansion:
+            if callable(e):
+                e(namespace)
+            else:
+                dest, val = e
+                setattr(namespace, dest, val)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(prog="lit", fromfile_prefix_chars="@")
     parser.add_argument(
@@ -55,41 +108,103 @@ def parse_args():
     )
 
     format_group = parser.add_argument_group("Output Format")
-    # FIXME: I find these names very confusing, although I like the
-    # functionality.
     format_group.add_argument(
-        "-q", "--quiet", help="Suppress no error output", action="store_true"
+        "--test-output",
+        help="Control whether the executed commands and their outputs are printed after each test has executed (default off). "
+        "If --print-result-after is set lower than the level given to --test-output, --print-result-after is raised to match.",
+        choices=["off", "failed", "all"],
+        default="off",
+        action=TestOutputAction,
+    )
+    format_group.add_argument(
+        "--print-result-after",
+        help="Control which the executed test names and results are printed after each test has executed (default all). "
+        "If --test-output is set higher than the level given to --print-result-after, --test-output is lowered to match.",
+        choices=["off", "failed", "all"],
+        default="all",
+        action=TestOutputAction,
+    )
+    format_group.add_argument(
+        "--diagnostic-level",
+        help="Control how verbose lit diagnostics should be (default note)",
+        choices=["error", "warning", "note"],
+        default="note",
+    )
+    format_group.add_argument(
+        "--terse-summary",
+        help="Print the elapsed time and the number of passed tests after all tests have finished (default on)",
+        action="store_true",
+        dest="terse_summary",
+    )
+    format_group.add_argument(
+        "--no-terse-summary",
+        help="Don't show the elapsed time after all tests have finished, and only show the number of failed tests.",
+        action="store_false",
+        dest="terse_summary",
+    )
+    parser.set_defaults(terse_summary=False)
+    format_group.add_argument(
+        "-q",
+        "--quiet",
+        help="Alias for '--diagnostic-level=error --test-output=off --terse-summary'",
+        action=AliasAction,
+        alias=[
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "print_result_after", "failed"
+            ),
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "test_output", "off"
+            ),
+            ("diagnostic_level", "error"),
+            ("terse_summary", True),
+        ],
     )
     format_group.add_argument(
         "-s",
         "--succinct",
-        help="Reduce amount of output."
-        " Additionally, show a progress bar,"
-        " unless --no-progress-bar is specified.",
-        action="store_true",
+        help="Alias for '--progress-bar --print-result-after=failed'",
+        action=AliasAction,
+        alias=[
+            ("useProgressBar", True),
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "print_result_after", "failed"
+            ),
+        ],
     )
     format_group.add_argument(
         "-v",
         "--verbose",
-        dest="showOutput",
         help="For failed tests, show all output. For example, each command is"
         " printed before it is executed, so the last printed command is the one"
-        " that failed.",
-        action="store_true",
+        " that failed. Alias for '--test-output=failed'",
+        action=AliasAction,
+        alias=[
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "test_output", "failed"
+            ),
+        ],
     )
     format_group.add_argument(
         "-vv",
         "--echo-all-commands",
-        dest="showOutput",
         help="Deprecated alias for -v.",
-        action="store_true",
+        action=AliasAction,
+        alias=[
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "test_output", "failed"
+            ),
+        ],
     )
     format_group.add_argument(
         "-a",
         "--show-all",
-        dest="showAllOutput",
-        help="Enable -v, but for all tests not just failed tests.",
-        action="store_true",
+        help="Enable -v, but for all tests not just failed tests. Alias for '--test-output=all'",
+        action=AliasAction,
+        alias=[
+            lambda namespace: TestOutputAction.setOutputLevel(
+                namespace, "test_output", "all"
+            ),
+        ],
     )
     format_group.add_argument(
         "-r",
@@ -105,10 +220,16 @@ def parse_args():
         help="Write test results to the provided path",
         metavar="PATH",
     )
+    format_group.add_argument(
+        "--progress-bar",
+        dest="useProgressBar",
+        help="Show curses based progress bar",
+        action="store_true",
+    )
     format_group.add_argument(
         "--no-progress-bar",
         dest="useProgressBar",
-        help="Do not use curses based progress bar",
+        help="Do not use curses based progress bar (default)",
         action="store_false",
     )
 
diff --git a/llvm/utils/lit/lit/discovery.py b/llvm/utils/lit/lit/discovery.py
index 2e7f90c6bb0c9..ac06223b45345 100644
--- a/llvm/utils/lit/lit/discovery.py
+++ b/llvm/utils/lit/lit/discovery.py
@@ -62,8 +62,7 @@ def search1(path):
                 cfgpath = target
 
         # We found a test suite, create a new config for it and load it.
-        if litConfig.debug:
-            litConfig.note("loading suite config %r" % cfgpath)
+        litConfig.dbg("loading suite config %r" % cfgpath)
 
         cfg = TestingConfig.fromdefaults(litConfig)
         cfg.load_from_path(cfgpath, litConfig)
@@ -115,8 +114,7 @@ def search1(path_in_suite):
         # Otherwise, copy the current config and load the local configuration
         # file into it.
         config = copy.deepcopy(parent)
-        if litConfig.debug:
-            litConfig.note("loading local config %r" % cfgpath)
+        litConfig.dbg("loading local config %r" % cfgpath)
         config.load_from_path(cfgpath, litConfig)
         return config
 
@@ -137,8 +135,7 @@ def getTests(path, litConfig, testSuiteCache, localConfigCache):
         litConfig.warning("unable to find test suite for %r" % path)
         return (), ()
 
-    if litConfig.debug:
-        litConfig.note("resolved input %r to %r::%r" % (path, ts.name, path_in_suite))
+    litConfig.dbg("resolved input %r to %r::%r" % (path, ts.name, path_in_suite))
 
     return ts, getTestsInSuite(
         ts,
diff --git a/llvm/utils/lit/lit/display.py b/llvm/utils/lit/lit/display.py
index b565bbc7a4f93..4dc04d93d3ea7 100644
--- a/llvm/utils/lit/lit/display.py
+++ b/llvm/utils/lit/lit/display.py
@@ -2,7 +2,7 @@
 
 
 def create_display(opts, tests, total_tests, workers):
-    if opts.quiet:
+    if opts.print_result_after == "off" and not opts.useProgressBar:
         return NopDisplay()
 
     num_tests = len(tests)
@@ -10,7 +10,7 @@ def create_display(opts, tests, total_tests, workers):
     header = "-- Testing: %d%s tests, %d workers --" % (num_tests, of_total, workers)
 
     progress_bar = None
-    if opts.succinct and opts.useProgressBar:
+    if opts.useProgressBar:
         import lit.ProgressBar
 
         try:
@@ -96,8 +96,8 @@ def update(self, test):
 
         show_result = (
             test.isFailure()
-            or self.opts.showAllOutput
-            or (not self.opts.quiet and not self.opts.succinct)
+            and self.opts.print_result_after == "failed"
+            or self.opts.print_result_after == "all"
         )
         if show_result:
             if self.progress_bar:
@@ -134,7 +134,9 @@ def print_result(self, test):
         )
 
         # Show the test failure output, if requested.
-        if (test.isFailure() and self.opts.showOutput) or self.opts.showAllOutput:
+        if (
+            test.isFailure() and self.opts.test_output == "failed"
+        ) or self.opts.test_output == "all":
             if test.isFailure():
                 print("%s TEST '%s' FAILED %s" % ("*" * 20, test_name, "*" * 20))
             out = test.result.output
diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py
index 913ba69d63328..59982c94b787c 100644
--- a/llvm/utils/lit/lit/llvm/config.py
+++ b/llvm/utils/lit/lit/llvm/config.py
@@ -53,7 +53,10 @@ def __init__(self, lit_config, config):
             self.use_lit_shell = True
 
             global lit_path_displayed
-            if not self.lit_config.quiet and lit_path_displayed is False:
+            if (
+                self.lit_config.diagnostic_level_enabled("note")
+                and lit_path_displayed is False
+            ):
                 self.lit_config.note("using lit tools: {}".format(path))
                 lit_path_displayed = True
 
@@ -527,7 +530,7 @@ def use_llvm_tool(
 
         if tool:
             tool = os.path.normpath(tool)
-            if not self.lit_config.quiet and not quiet:
+            if not quiet:
                 self.lit_config.note("using {}: {}".format(name, tool))
         return tool
 
@@ -637,10 +640,9 @@ def clang_setup(
                 ("%ms_abi_triple", self.make_msabi_triple(self.config.target_triple))
             )
         else:
-            if not self.lit_config.quiet:
-                self.lit_config.note(
-                    "No default target triple was found, some tests may fail as a result."
-                )
+            self.lit_config.note(
+                "No default target triple was found, some tests may fail as a result."
+            )
             self.config.substitutions.append(("%itanium_abi_triple", ""))
             self.config.substitutions.append(("%ms_abi_triple", ""))
 
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
index a585cc0abdd48..07e809b168dc2 100755
--- a/llvm/utils/lit/lit/main.py
+++ b/llvm/utils/lit/lit/main.py
@@ -30,7 +30,7 @@ def main(builtin_params={}):
     lit_config = lit.LitConfig.LitConfig(
         progname=os.path.basename(sys.argv[0]),
         path=opts.path,
-        quiet=opts.quiet,
+        diagnostic_level=opts.diagnostic_level,
         useValgrind=opts.useValgrind,
         valgrindLeakCheck=opts.valgrindLeakCheck,
         valgrindArgs=opts.valgrindArgs,
@@ -332,7 +332,7 @@ def print_results(tests, elapsed, opts):
             opts.printPathRelativeCWD,
         )
 
-    print_summary(total_tests, tests_by_code, opts.quiet, elapsed)
+    print_summary(total_tests, tests_by_code, opts.terse_summary, elapsed)
 
 
 def print_group(tests, code, shown_codes, printPathRelativeCWD):
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/fail.txt b/llvm/utils/lit/tests/Inputs/verbosity/fail.txt
new file mode 100644
index 0000000000000..2bcca02683614
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/fail.txt
@@ -0,0 +1,2 @@
+RUN: echo "fail test output"
+RUN: fail
\ No newline at end of file
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/lit.cfg b/llvm/utils/lit/tests/Inputs/verbosity/lit.cfg
new file mode 100644
index 0000000000000..c3a1f4f4d873a
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/lit.cfg
@@ -0,0 +1,11 @@
+import lit.formats
+
+config.name = "verbosity"
+config.suffixes = [".txt"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+
+lit_config.dbg("this is a debug log")
+lit_config.note("this is a note")
+lit_config.warning("this is a warning")
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/pass.txt b/llvm/utils/lit/tests/Inputs/verbosity/pass.txt
new file mode 100644
index 0000000000000..f64843827e147
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/pass.txt
@@ -0,0 +1 @@
+RUN: echo "pass test output"
\ No newline at end of file
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/unsupported.txt b/llvm/utils/lit/tests/Inputs/verbosity/unsupported.txt
new file mode 100644
index 0000000000000..f5ebd4da178f8
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/unsupported.txt
@@ -0,0 +1,2 @@
+REQUIRES: asdf
+RUN: not echo "unsupported test output"
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/xfail.txt b/llvm/utils/lit/tests/Inputs/verbosity/xfail.txt
new file mode 100644
index 0000000000000..85001cc22b08e
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/xfail.txt
@@ -0,0 +1,2 @@
+XFAIL: *
+RUN: not echo "xfail test output"
\ No newline at end of file
diff --git a/llvm/utils/lit/tests/Inputs/verbosity/xpass.txt b/llvm/utils/lit/tests/Inputs/verbosity/xpass.txt
new file mode 100644
index 0000000000000..87c95ec75ecdc
--- /dev/null
+++ b/llvm/utils/lit/tests/Inputs/verbosity/xpass.txt
@@ -0,0 +1,2 @@
+XFAIL: *
+RUN: echo "xpass test output"
diff --git a/llvm/utils/lit/tests/lit-opts.py b/llvm/utils/lit/tests/lit-opts.py
index a533a59d9d124..0759c1d17be58 100644
--- a/llvm/utils/lit/tests/lit-opts.py
+++ b/llvm/utils/lit/tests/lit-opts.py
@@ -12,13 +12,13 @@
 
 # Check that LIT_OPTS understands multiple options with arbitrary spacing.
 #
-# RUN: env LIT_OPTS='-a -v  -Dvar=foobar' \
+# RUN: env LIT_OPTS='-v -a  -Dvar=foobar' \
 # RUN: %{lit} -s %{inputs}/lit-opts \
 # RUN: | FileCheck -check-prefix=SHOW-ALL -DVAR=foobar %s
 
 # Check that LIT_OPTS parses shell-like quotes and escapes.
 #
-# RUN: env LIT_OPTS='-a   -v -Dvar="foo bar"\ baz' \
+# RUN: env LIT_OPTS='-v   -a -Dvar="foo bar"\ baz' \
 # RUN: %{lit} -s %{inputs}/lit-opts \
 # RUN: | FileCheck -check-prefix=SHOW-ALL -DVAR="foo bar baz" %s
 
diff --git a/llvm/utils/lit/tests/per-test-coverage-by-lit-cfg.py b/llvm/utils/lit/tests/per-test-coverage-by-lit-cfg.py
index 189c1cebd623b..b3af606c52f18 100644
--- a/llvm/utils/lit/tests/per-test-coverage-by-lit-cfg.py
+++ b/llvm/utils/lit/tests/per-test-coverage-by-lit-cfg.py
@@ -1,10 +1,10 @@
 # Test if lit_config.per_test_coverage in lit.cfg sets individual test case coverage.
 
-# RUN: %{lit} -a -vv -Dexecute_external=False \
+# RUN: %{lit} -a -Dexecute_external=False \
 # RUN:     %{inputs}/per-test-coverage-by-lit-cfg/per-test-coverage-by-lit-cfg.py | \
 # RUN:   FileCheck -DOUT=stdout %s
 
-# RUN: %{lit} -a -vv -Dexecute_external=True \
+# RUN: %{lit} -a -Dexecute_external=True \
 # RUN:     %{inputs}/per-test-coverage-by-lit-cfg/per-test-coverage-by-lit-cfg.py | \
 # RUN:   FileCheck -DOUT=stderr %s
 
diff --git a/llvm/utils/lit/tests/per-test-coverage.py b/llvm/utils/lit/tests/per-test-coverage.py
index cf5e82c44dc51..ba513554ae76e 100644
--- a/llvm/utils/lit/tests/per-test-coverage.py
+++ b/llvm/utils/lit/tests/per-test-coverage.py
@@ -1,10 +1,10 @@
 # Test LLVM_PROFILE_FILE is set when --per-test-coverage is passed to command line.
 
-# RUN: %{lit} -a -vv --per-test-coverage -Dexecute_external=False \
+# RUN: %{lit} -a --per-test-coverage -Dexecute_external=False \
 # RUN:     %{inputs}/per-test-coverage/per-test-coverage.py | \
 # RUN:   FileCheck -DOUT=stdout %s
 
-# RUN: %{lit} -a -vv --per-test-coverage -Dexecute_external=True \
+# RUN: %{lit} -a --per-test-coverage -Dexecute_external=True \
 # RUN:        %{inputs}/per-test-coverage/per-test-coverage.py | \
 # RUN:   FileCheck -DOUT=stderr %s
 
diff --git a/llvm/utils/lit/tests/shtest-cat.py b/llvm/utils/lit/tests/shtest-cat.py
index 5efe25c41684a..9763f9fbf1a9d 100644
--- a/llvm/utils/lit/tests/shtest-cat.py
+++ b/llvm/utils/lit/tests/shtest-cat.py
@@ -1,6 +1,6 @@
 ## Test the cat command.
 #
-# RUN: not %{lit} -a -v %{inputs}/shtest-cat \
+# RUN: not %{lit} -v %{inputs}/shtest-cat \
 # RUN: | FileCheck -match-full-lines %s
 # END.
 
diff --git a/llvm/utils/lit/tests/shtest-env-negative.py b/llvm/utils/lit/tests/shtest-env-negative.py
index c8b59b224e7c4..236c6a19e694b 100644
--- a/llvm/utils/lit/tests/shtest-env-negative.py
+++ b/llvm/utils/lit/tests/shtest-env-negative.py
@@ -1,6 +1,6 @@
 ## Test the env command (failing tests).
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-env-negative \
+# RUN: not %{lit} -v %{inputs}/shtest-env-negative \
 # RUN: | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-env-path.py b/llvm/utils/lit/tests/shtest-env-path.py
index bf459ae53fbc0..7f04756ed6ad5 100644
--- a/llvm/utils/lit/tests/shtest-env-path.py
+++ b/llvm/utils/lit/tests/shtest-env-path.py
@@ -1,9 +1,9 @@
 ## Tests env command for setting the PATH variable.
 
 # The test is using /bin/sh. Limit to system known to have /bin/sh.
-# REQUIRES: system-linux
+# REQUIRES: system-linux || system-darwin
 
-# RUN: %{lit} -a -v %{inputs}/shtest-env-path/path.txt \
+# RUN: %{lit} -a %{inputs}/shtest-env-path/path.txt \
 # RUN:   | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-env-positive.py b/llvm/utils/lit/tests/shtest-env-positive.py
index 4f07b69ecc7d3..089acd308c5c5 100644
--- a/llvm/utils/lit/tests/shtest-env-positive.py
+++ b/llvm/utils/lit/tests/shtest-env-positive.py
@@ -1,6 +1,6 @@
 ## Test the env command (passing tests).
 
-# RUN: %{lit} -a -v %{inputs}/shtest-env-positive \
+# RUN: %{lit} -a %{inputs}/shtest-env-positive \
 # RUN:   | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-export.py b/llvm/utils/lit/tests/shtest-export.py
index f2de8e8cd8b5f..d45a94a5eb830 100644
--- a/llvm/utils/lit/tests/shtest-export.py
+++ b/llvm/utils/lit/tests/shtest-export.py
@@ -1,6 +1,6 @@
 ## Test the export command.
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-export \
+# RUN: not %{lit} -v %{inputs}/shtest-export \
 # RUN: | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py
index aa4705b634a7d..ba609e036c166 100644
--- a/llvm/utils/lit/tests/shtest-glob.py
+++ b/llvm/utils/lit/tests/shtest-glob.py
@@ -1,6 +1,6 @@
 ## Tests glob pattern handling in echo command.
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-glob \
+# RUN: not %{lit} -v %{inputs}/shtest-glob \
 # RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s
 # END.
 
diff --git a/llvm/utils/lit/tests/shtest-not.py b/llvm/utils/lit/tests/shtest-not.py
index b42769ffd9383..e735d38260b37 100644
--- a/llvm/utils/lit/tests/shtest-not.py
+++ b/llvm/utils/lit/tests/shtest-not.py
@@ -1,6 +1,6 @@
 # Check the not command
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-not \
+# RUN: not %{lit} -a %{inputs}/shtest-not \
 # RUN: | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-pushd-popd.py b/llvm/utils/lit/tests/shtest-pushd-popd.py
index f917c1a4a4599..799e9d6d65951 100644
--- a/llvm/utils/lit/tests/shtest-pushd-popd.py
+++ b/llvm/utils/lit/tests/shtest-pushd-popd.py
@@ -1,6 +1,6 @@
 # Check the pushd and popd commands
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-pushd-popd \
+# RUN: not %{lit} -v %{inputs}/shtest-pushd-popd \
 # RUN: | FileCheck -match-full-lines %s
 #
 # END.
diff --git a/llvm/utils/lit/tests/shtest-readfile-external.py b/llvm/utils/lit/tests/shtest-readfile-external.py
index 6fe1088efd674..0d8e3ad1242bf 100644
--- a/llvm/utils/lit/tests/shtest-readfile-external.py
+++ b/llvm/utils/lit/tests/shtest-readfile-external.py
@@ -4,7 +4,7 @@
 # ALLOW_RETRIES: 2
 
 # UNSUPPORTED: system-windows
-# RUN: env LIT_USE_INTERNAL_SHELL=0 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S/Inputs/shtest-readfile/Output %s
+# RUN: env LIT_USE_INTERNAL_SHELL=0 not %{lit} -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S/Inputs/shtest-readfile/Output %s
 
 # CHECK: -- Testing: 5 tests{{.*}}
 
diff --git a/llvm/utils/lit/tests/shtest-readfile.py b/llvm/utils/lit/tests/shtest-readfile.py
index 218da2257bcff..ca57db82e6617 100644
--- a/llvm/utils/lit/tests/shtest-readfile.py
+++ b/llvm/utils/lit/tests/shtest-readfile.py
@@ -3,7 +3,7 @@
 # TODO(boomanaiden154): This sometimes fails, possibly due to buffers not being flushed.
 # ALLOW_RETRIES: 2
 
-# RUN: env LIT_USE_INTERNAL_SHELL=1  not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S%{fs-sep}Inputs%{fs-sep}shtest-readfile%{fs-sep}Output %s
+# RUN: env LIT_USE_INTERNAL_SHELL=1  not %{lit} -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S%{fs-sep}Inputs%{fs-sep}shtest-readfile%{fs-sep}Output %s
 
 # CHECK: -- Testing: 5 tests{{.*}}
 
diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
index d81cde0159792..d5340a7d2efb9 100644
--- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
+++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
@@ -4,7 +4,7 @@
 # These tests are specific to options that Darwin does not support.
 # UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s
+# RUN: not %{lit} -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s
 
 # CHECK: -- Testing: 2 tests{{.*}}
 
diff --git a/llvm/utils/lit/tests/shtest-ulimit.py b/llvm/utils/lit/tests/shtest-ulimit.py
index 21e5a5e2491d1..582477bef65fc 100644
--- a/llvm/utils/lit/tests/shtest-ulimit.py
+++ b/llvm/utils/lit/tests/shtest-ulimit.py
@@ -8,7 +8,7 @@
 # RUN: %{python} %S/Inputs/shtest-ulimit/print_limits.py | grep RLIMIT_NOFILE \
 # RUN:   | sed -n -e 's/.*=//p' | tr -d '\n' > %t.nofile_limit
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-ulimit --order=lexical \
+# RUN: not %{lit} -v %{inputs}/shtest-ulimit --order=lexical \
 # RUN:   | FileCheck -DBASE_NOFILE_LIMIT=%{readfile:%t.nofile_limit} %s
 
 # CHECK: -- Testing: 3 tests{{.*}}
diff --git a/llvm/utils/lit/tests/shtest-umask.py b/llvm/utils/lit/tests/shtest-umask.py
index e67f0308db661..8af81ec3b4ebd 100644
--- a/llvm/utils/lit/tests/shtest-umask.py
+++ b/llvm/utils/lit/tests/shtest-umask.py
@@ -1,6 +1,6 @@
 # Check the umask command
 
-# RUN: not %{lit} -a -v %{inputs}/shtest-umask | FileCheck -match-full-lines %s
+# RUN: not %{lit} -v %{inputs}/shtest-umask | FileCheck -match-full-lines %s
 # TODO(boomanaiden154): We should be asserting that we get expected behavior
 # on Windows rather than just listing this as unsupported.
 # UNSUPPORTED: system-windows
diff --git a/llvm/utils/lit/tests/unit/TestRunner.py b/llvm/utils/lit/tests/unit/TestRunner.py
index 09470c7b9386e..a3fa62e1ef0e1 100644
--- a/llvm/utils/lit/tests/unit/TestRunner.py
+++ b/llvm/utils/lit/tests/unit/TestRunner.py
@@ -30,7 +30,7 @@ def load_keyword_parser_lit_tests():
         lit_config = lit.LitConfig.LitConfig(
             progname="lit",
             path=[],
-            quiet=False,
+            diagnostic_level="note",
             useValgrind=False,
             valgrindLeakCheck=False,
             valgrindArgs=[],
diff --git a/llvm/utils/lit/tests/verbosity.py b/llvm/utils/lit/tests/verbosity.py
new file mode 100644
index 0000000000000..62baf618e2aca
--- /dev/null
+++ b/llvm/utils/lit/tests/verbosity.py
@@ -0,0 +1,1130 @@
+# Test various combinations of options controlling lit stdout and stderr output
+
+# RUN: mkdir -p %t
+
+### Test default
+
+# RUN: not %{lit} %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# NO-ARGS:      -- Testing: 5 tests, 1 workers --
+# NO-ARGS-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# NO-ARGS-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# NO-ARGS-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# NO-ARGS-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# NO-ARGS-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# NO-ARGS-NEXT: ********************
+# NO-ARGS-NEXT: Failed Tests (1):
+# NO-ARGS-NEXT:   verbosity :: fail.txt
+# NO-ARGS-EMPTY:
+# NO-ARGS-NEXT: ********************
+# NO-ARGS-NEXT: Unexpectedly Passed Tests (1):
+# NO-ARGS-NEXT:   verbosity :: xpass.txt
+# NO-ARGS-EMPTY:
+# NO-ARGS-EMPTY:
+# NO-ARGS-NEXT: Testing Time: {{.*}}s
+# NO-ARGS-EMPTY:
+# NO-ARGS-NEXT: Total Discovered Tests: 5
+# NO-ARGS-NEXT:   Unsupported        : 1 (20.00%)
+# NO-ARGS-NEXT:   Passed             : 1 (20.00%)
+# NO-ARGS-NEXT:   Expectedly Failed  : 1 (20.00%)
+# NO-ARGS-NEXT:   Failed             : 1 (20.00%)
+# NO-ARGS-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# NO-ARGS-ERR: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: note: this is a note
+# NO-ARGS-ERR-NEXT: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: warning: this is a warning
+# NO-ARGS-ERR-EMPTY:
+# NO-ARGS-ERR-NEXT: 1 warning(s) in tests
+
+
+### Test aliases
+
+# RUN: not %{lit} --succinct %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SUCCINCT < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# SUCCINCT:      -- Testing: 5 tests, 1 workers --
+# SUCCINCT-NEXT: Testing:
+# SUCCINCT-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# SUCCINCT-NEXT: Testing:
+# SUCCINCT-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# SUCCINCT-NEXT: Testing:
+# SUCCINCT-NEXT: ********************
+# SUCCINCT-NEXT: Failed Tests (1):
+# SUCCINCT-NEXT:   verbosity :: fail.txt
+# SUCCINCT-EMPTY:
+# SUCCINCT-NEXT: ********************
+# SUCCINCT-NEXT: Unexpectedly Passed Tests (1):
+# SUCCINCT-NEXT:   verbosity :: xpass.txt
+# SUCCINCT-EMPTY:
+# SUCCINCT-EMPTY:
+# SUCCINCT-NEXT: Testing Time: {{.*}}s
+# SUCCINCT-EMPTY:
+# SUCCINCT-NEXT: Total Discovered Tests: 5
+# SUCCINCT-NEXT:   Unsupported        : 1 (20.00%)
+# SUCCINCT-NEXT:   Passed             : 1 (20.00%)
+# SUCCINCT-NEXT:   Expectedly Failed  : 1 (20.00%)
+# SUCCINCT-NEXT:   Failed             : 1 (20.00%)
+# SUCCINCT-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --verbose %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix VERBOSE < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# VERBOSE:      -- Testing: 5 tests, 1 workers --
+# VERBOSE-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# VERBOSE-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# VERBOSE-NEXT: Exit Code: 127
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: Command Output (stdout):
+# VERBOSE-NEXT: --
+# VERBOSE-NEXT: # {{R}}UN: at line 1
+# VERBOSE-NEXT: echo "fail test output"
+# VERBOSE-NEXT: # executed command: echo 'fail test output'
+# VERBOSE-NEXT: # .---command stdout------------
+# VERBOSE-NEXT: # | fail test output
+# VERBOSE-NEXT: # `-----------------------------
+# VERBOSE-NEXT: # {{R}}UN: at line 2
+# VERBOSE-NEXT: fail
+# VERBOSE-NEXT: # executed command: fail
+# VERBOSE-NEXT: # .---command stderr------------
+# VERBOSE-NEXT: # | 'fail': command not found
+# VERBOSE-NEXT: # `-----------------------------
+# VERBOSE-NEXT: # error: command failed with exit status: 127
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: --
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: ********************
+# VERBOSE-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# VERBOSE-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# VERBOSE-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# VERBOSE-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# VERBOSE-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# VERBOSE-NEXT: Exit Code: 0
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: Command Output (stdout):
+# VERBOSE-NEXT: --
+# VERBOSE-NEXT: # {{R}}UN: at line 2
+# VERBOSE-NEXT: echo "xpass test output"
+# VERBOSE-NEXT: # executed command: echo 'xpass test output'
+# VERBOSE-NEXT: # .---command stdout------------
+# VERBOSE-NEXT: # | xpass test output
+# VERBOSE-NEXT: # `-----------------------------
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: --
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: ********************
+# VERBOSE-NEXT: ********************
+# VERBOSE-NEXT: Failed Tests (1):
+# VERBOSE-NEXT:   verbosity :: fail.txt
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: ********************
+# VERBOSE-NEXT: Unexpectedly Passed Tests (1):
+# VERBOSE-NEXT:   verbosity :: xpass.txt
+# VERBOSE-EMPTY:
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: Testing Time: {{.*}}s
+# VERBOSE-EMPTY:
+# VERBOSE-NEXT: Total Discovered Tests: 5
+# VERBOSE-NEXT:   Unsupported        : 1 (20.00%)
+# VERBOSE-NEXT:   Passed             : 1 (20.00%)
+# VERBOSE-NEXT:   Expectedly Failed  : 1 (20.00%)
+# VERBOSE-NEXT:   Failed             : 1 (20.00%)
+# VERBOSE-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --show-all %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SHOW-ALL < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# SHOW-ALL:      -- Testing: 5 tests, 1 workers --
+# SHOW-ALL-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# SHOW-ALL-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# SHOW-ALL-NEXT: Exit Code: 127
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Command Output (stdout):
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-NEXT: # {{R}}UN: at line 1
+# SHOW-ALL-NEXT: echo "fail test output"
+# SHOW-ALL-NEXT: # executed command: echo 'fail test output'
+# SHOW-ALL-NEXT: # .---command stdout------------
+# SHOW-ALL-NEXT: # | fail test output
+# SHOW-ALL-NEXT: # `-----------------------------
+# SHOW-ALL-NEXT: # {{R}}UN: at line 2
+# SHOW-ALL-NEXT: fail
+# SHOW-ALL-NEXT: # executed command: fail
+# SHOW-ALL-NEXT: # .---command stderr------------
+# SHOW-ALL-NEXT: # | 'fail': command not found
+# SHOW-ALL-NEXT: # `-----------------------------
+# SHOW-ALL-NEXT: # error: command failed with exit status: 127
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# SHOW-ALL-NEXT: Exit Code: 0
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Command Output (stdout):
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-NEXT: # {{R}}UN: at line 1
+# SHOW-ALL-NEXT: echo "pass test output"
+# SHOW-ALL-NEXT: # executed command: echo 'pass test output'
+# SHOW-ALL-NEXT: # .---command stdout------------
+# SHOW-ALL-NEXT: # | pass test output
+# SHOW-ALL-NEXT: # `-----------------------------
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# SHOW-ALL-NEXT: Test requires the following unavailable features: asdf
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# SHOW-ALL-NEXT: Exit Code: 1
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Command Output (stdout):
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-NEXT: # {{R}}UN: at line 2
+# SHOW-ALL-NEXT: not echo "xfail test output"
+# SHOW-ALL-NEXT: # executed command: not echo 'xfail test output'
+# SHOW-ALL-NEXT: # .---command stdout------------
+# SHOW-ALL-NEXT: # | xfail test output
+# SHOW-ALL-NEXT: # `-----------------------------
+# SHOW-ALL-NEXT: # error: command failed with exit status: 1
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# SHOW-ALL-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# SHOW-ALL-NEXT: Exit Code: 0
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Command Output (stdout):
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-NEXT: # {{R}}UN: at line 2
+# SHOW-ALL-NEXT: echo "xpass test output"
+# SHOW-ALL-NEXT: # executed command: echo 'xpass test output'
+# SHOW-ALL-NEXT: # .---command stdout------------
+# SHOW-ALL-NEXT: # | xpass test output
+# SHOW-ALL-NEXT: # `-----------------------------
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: --
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: Failed Tests (1):
+# SHOW-ALL-NEXT:   verbosity :: fail.txt
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: ********************
+# SHOW-ALL-NEXT: Unexpectedly Passed Tests (1):
+# SHOW-ALL-NEXT:   verbosity :: xpass.txt
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Testing Time: {{.*}}s
+# SHOW-ALL-EMPTY:
+# SHOW-ALL-NEXT: Total Discovered Tests: 5
+# SHOW-ALL-NEXT:   Unsupported        : 1 (20.00%)
+# SHOW-ALL-NEXT:   Passed             : 1 (20.00%)
+# SHOW-ALL-NEXT:   Expectedly Failed  : 1 (20.00%)
+# SHOW-ALL-NEXT:   Failed             : 1 (20.00%)
+# SHOW-ALL-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --quiet %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# QUIET:      -- Testing: 5 tests, 1 workers --
+# QUIET-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# QUIET-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# QUIET-NEXT: ********************
+# QUIET-NEXT: Failed Tests (1):
+# QUIET-NEXT:   verbosity :: fail.txt
+# QUIET-EMPTY:
+# QUIET-NEXT: ********************
+# QUIET-NEXT: Unexpectedly Passed Tests (1):
+# QUIET-NEXT:   verbosity :: xpass.txt
+# QUIET-EMPTY:
+# QUIET-EMPTY:
+# QUIET-NEXT: Total Discovered Tests: 5
+# QUIET-NEXT:   Failed             : 1 (20.00%)
+# QUIET-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# QUIET-ERR: 1 warning(s) in tests
+
+
+### Test log output
+
+# RUN: not %{lit} --debug %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix DEBUG < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix DEBUG-ERR --implicit-check-not lit < %t/stderr.txt
+
+# DEBUG:      -- Testing: 5 tests, 1 workers --
+# DEBUG-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# DEBUG-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# DEBUG-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# DEBUG-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# DEBUG-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# DEBUG-NEXT: ********************
+# DEBUG-NEXT: Failed Tests (1):
+# DEBUG-NEXT:   verbosity :: fail.txt
+# DEBUG-EMPTY:
+# DEBUG-NEXT: ********************
+# DEBUG-NEXT: Unexpectedly Passed Tests (1):
+# DEBUG-NEXT:   verbosity :: xpass.txt
+# DEBUG-EMPTY:
+# DEBUG-EMPTY:
+# DEBUG-NEXT: Testing Time: {{.*}}s
+# DEBUG-EMPTY:
+# DEBUG-NEXT: Total Discovered Tests: 5
+# DEBUG-NEXT:   Unsupported        : 1 (20.00%)
+# DEBUG-NEXT:   Passed             : 1 (20.00%)
+# DEBUG-NEXT:   Expectedly Failed  : 1 (20.00%)
+# DEBUG-NEXT:   Failed             : 1 (20.00%)
+# DEBUG-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# DEBUG-ERR:      lit.py: {{.*}}discovery.py:{{[0-9]+}}: debug: loading suite config '{{.*}}lit.cfg'
+# DEBUG-ERR-NEXT: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: debug: this is a debug log
+# DEBUG-ERR-NEXT: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: note: this is a note
+# DEBUG-ERR-NEXT: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: warning: this is a warning
+# DEBUG-ERR-NEXT: lit.py: {{.*}}TestingConfig.py:{{[0-9]+}}: debug: ... loaded config '{{.*}}lit.cfg'
+# DEBUG-ERR-NEXT: lit.py: {{.*}}discovery.py:{{[0-9]+}}: debug: resolved input '{{.*}}verbosity' to 'verbosity'::()
+# DEBUG-ERR-EMPTY:
+# DEBUG-ERR-NEXT: 1 warning(s) in tests
+
+
+# RUN: not %{lit} --diagnostic-level note %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --diagnostic-level warning %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix WARNING-ERR --implicit-check-not lit < %t/stderr.txt
+
+# WARNING-ERR: lit.py: {{.*}}lit.cfg:{{[0-9]+}}: warning: this is a warning
+# WARNING-ERR-EMPTY:
+# WARNING-ERR-NEXT: 1 warning(s) in tests
+
+# RUN: not %{lit} --diagnostic-level error %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix ERROR-ERR --implicit-check-not lit < %t/stderr.txt
+
+# ERROR-ERR: 1 warning(s) in tests
+
+
+### Test --test-output
+
+# RUN: not %{lit} --test-output off  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --test-output failed  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix VERBOSE < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# TEST-OUTPUT-OFF:      -- Testing: 5 tests, 1 workers --
+# TEST-OUTPUT-OFF-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# TEST-OUTPUT-OFF-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# TEST-OUTPUT-OFF-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# TEST-OUTPUT-OFF-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# TEST-OUTPUT-OFF-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# TEST-OUTPUT-OFF-NEXT: ********************
+# TEST-OUTPUT-OFF-NEXT: Failed Tests (1):
+# TEST-OUTPUT-OFF-NEXT:   verbosity :: fail.txt
+# TEST-OUTPUT-OFF-EMPTY:
+# TEST-OUTPUT-OFF-NEXT: ********************
+# TEST-OUTPUT-OFF-NEXT: Unexpectedly Passed Tests (1):
+# TEST-OUTPUT-OFF-NEXT:   verbosity :: xpass.txt
+# TEST-OUTPUT-OFF-EMPTY:
+# TEST-OUTPUT-OFF-EMPTY:
+# TEST-OUTPUT-OFF-NEXT: Testing Time: {{.*}}s
+# TEST-OUTPUT-OFF-EMPTY:
+# TEST-OUTPUT-OFF-NEXT: Total Discovered Tests: 5
+# TEST-OUTPUT-OFF-NEXT:   Unsupported        : 1 (20.00%)
+# TEST-OUTPUT-OFF-NEXT:   Passed             : 1 (20.00%)
+# TEST-OUTPUT-OFF-NEXT:   Expectedly Failed  : 1 (20.00%)
+# TEST-OUTPUT-OFF-NEXT:   Failed             : 1 (20.00%)
+# TEST-OUTPUT-OFF-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --test-output all  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SHOW-ALL < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+
+### Test --print-result-after
+
+# RUN: not %{lit} --print-result-after off  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix RESULT-OFF < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RESULT-OFF:      ********************
+# RESULT-OFF-NEXT: Failed Tests (1):
+# RESULT-OFF-NEXT:   verbosity :: fail.txt
+# RESULT-OFF-EMPTY:
+# RESULT-OFF-NEXT: ********************
+# RESULT-OFF-NEXT: Unexpectedly Passed Tests (1):
+# RESULT-OFF-NEXT:   verbosity :: xpass.txt
+# RESULT-OFF-EMPTY:
+# RESULT-OFF-EMPTY:
+# RESULT-OFF-NEXT: Testing Time: {{.*}}s
+# RESULT-OFF-EMPTY:
+# RESULT-OFF-NEXT: Total Discovered Tests: 5
+# RESULT-OFF-NEXT:   Unsupported        : 1 (20.00%)
+# RESULT-OFF-NEXT:   Passed             : 1 (20.00%)
+# RESULT-OFF-NEXT:   Expectedly Failed  : 1 (20.00%)
+# RESULT-OFF-NEXT:   Failed             : 1 (20.00%)
+# RESULT-OFF-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+# RUN: not %{lit} --print-result-after failed  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix RESULT-FAILED < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RESULT-FAILED:      -- Testing: 5 tests, 1 workers --
+# RESULT-FAILED-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# RESULT-FAILED-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# RESULT-FAILED-NEXT: ********************
+# RESULT-FAILED-NEXT: Failed Tests (1):
+# RESULT-FAILED-NEXT:   verbosity :: fail.txt
+# RESULT-FAILED-EMPTY:
+# RESULT-FAILED-NEXT: ********************
+# RESULT-FAILED-NEXT: Unexpectedly Passed Tests (1):
+# RESULT-FAILED-NEXT:   verbosity :: xpass.txt
+# RESULT-FAILED-EMPTY:
+# RESULT-FAILED-EMPTY:
+# RESULT-FAILED-NEXT: Testing Time: {{.*}}s
+# RESULT-FAILED-EMPTY:
+# RESULT-FAILED-NEXT: Total Discovered Tests: 5
+# RESULT-FAILED-NEXT:   Unsupported        : 1 (20.00%)
+# RESULT-FAILED-NEXT:   Passed             : 1 (20.00%)
+# RESULT-FAILED-NEXT:   Expectedly Failed  : 1 (20.00%)
+# RESULT-FAILED-NEXT:   Failed             : 1 (20.00%)
+# RESULT-FAILED-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+# RUN: not %{lit} --print-result-after all  %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+
+### Test combinations of --print-result-after followed by --test-output
+
+# RUN: not %{lit} --print-result-after off --test-output failed %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix RESULT-OFF-OUTPUT-FAILED < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RESULT-OFF-OUTPUT-FAILED:      -- Testing: 5 tests, 1 workers --
+# RESULT-OFF-OUTPUT-FAILED-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Exit Code: 127
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Command Output (stdout):
+# RESULT-OFF-OUTPUT-FAILED-NEXT: --
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # {{R}}UN: at line 1
+# RESULT-OFF-OUTPUT-FAILED-NEXT: echo "fail test output"
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # executed command: echo 'fail test output'
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # .---command stdout------------
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # | fail test output
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # `-----------------------------
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # {{R}}UN: at line 2
+# RESULT-OFF-OUTPUT-FAILED-NEXT: fail
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # executed command: fail
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # .---command stderr------------
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # | 'fail': command not found
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # `-----------------------------
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # error: command failed with exit status: 127
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: --
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Exit Code: 0
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Command Output (stdout):
+# RESULT-OFF-OUTPUT-FAILED-NEXT: --
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # {{R}}UN: at line 2
+# RESULT-OFF-OUTPUT-FAILED-NEXT: echo "xpass test output"
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # executed command: echo 'xpass test output'
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # .---command stdout------------
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # | xpass test output
+# RESULT-OFF-OUTPUT-FAILED-NEXT: # `-----------------------------
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: --
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Failed Tests (1):
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   verbosity :: fail.txt
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: ********************
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Unexpectedly Passed Tests (1):
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   verbosity :: xpass.txt
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Testing Time: {{.*}}s
+# RESULT-OFF-OUTPUT-FAILED-EMPTY:
+# RESULT-OFF-OUTPUT-FAILED-NEXT: Total Discovered Tests: 5
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   Unsupported        : 1 (20.00%)
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   Passed             : 1 (20.00%)
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   Expectedly Failed  : 1 (20.00%)
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   Failed             : 1 (20.00%)
+# RESULT-OFF-OUTPUT-FAILED-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --print-result-after all --test-output off %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --print-result-after failed --test-output all %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SHOW-ALL < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+
+### Test combinations of --test-output followed by --print-result-after
+
+# RUN: not %{lit} --test-output failed --print-result-after off %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix RESULT-OFF < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --test-output off --print-result-after all %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --test-output all --print-result-after failed %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix OUTPUT-ALL-RESULT-FAILED < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# OUTPUT-ALL-RESULT-FAILED:      -- Testing: 5 tests, 1 workers --
+# OUTPUT-ALL-RESULT-FAILED-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Exit Code: 127
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Command Output (stdout):
+# OUTPUT-ALL-RESULT-FAILED-NEXT: --
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # {{R}}UN: at line 1
+# OUTPUT-ALL-RESULT-FAILED-NEXT: echo "fail test output"
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # executed command: echo 'fail test output'
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # .---command stdout------------
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # | fail test output
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # `-----------------------------
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # {{R}}UN: at line 2
+# OUTPUT-ALL-RESULT-FAILED-NEXT: fail
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # executed command: fail
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # .---command stderr------------
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # | 'fail': command not found
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # `-----------------------------
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # error: command failed with exit status: 127
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: --
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Exit Code: 0
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Command Output (stdout):
+# OUTPUT-ALL-RESULT-FAILED-NEXT: --
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # {{R}}UN: at line 2
+# OUTPUT-ALL-RESULT-FAILED-NEXT: echo "xpass test output"
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # executed command: echo 'xpass test output'
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # .---command stdout------------
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # | xpass test output
+# OUTPUT-ALL-RESULT-FAILED-NEXT: # `-----------------------------
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: --
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Failed Tests (1):
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   verbosity :: fail.txt
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: ********************
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Unexpectedly Passed Tests (1):
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   verbosity :: xpass.txt
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Testing Time: {{.*}}
+# OUTPUT-ALL-RESULT-FAILED-EMPTY:
+# OUTPUT-ALL-RESULT-FAILED-NEXT: Total Discovered Tests: 5
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   Unsupported        : 1 (20.00%)
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   Passed             : 1 (20.00%)
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   Expectedly Failed  : 1 (20.00%)
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   Failed             : 1 (20.00%)
+# OUTPUT-ALL-RESULT-FAILED-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+### Test progress bar and terse summary in isolation
+
+# RUN: not %{lit} --progress-bar %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix PROGRESS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# PROGRESS:      -- Testing: 5 tests, 1 workers --
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# PROGRESS-NEXT: Testing:
+# PROGRESS-NEXT: ********************
+# PROGRESS-NEXT: Failed Tests (1):
+# PROGRESS-NEXT:   verbosity :: fail.txt
+# PROGRESS-EMPTY:
+# PROGRESS-NEXT: ********************
+# PROGRESS-NEXT: Unexpectedly Passed Tests (1):
+# PROGRESS-NEXT:   verbosity :: xpass.txt
+# PROGRESS-EMPTY:
+# PROGRESS-EMPTY:
+# PROGRESS-NEXT: Testing Time: {{.*}}s
+# PROGRESS-EMPTY:
+# PROGRESS-NEXT: Total Discovered Tests: 5
+# PROGRESS-NEXT:   Unsupported        : 1 (20.00%)
+# PROGRESS-NEXT:   Passed             : 1 (20.00%)
+# PROGRESS-NEXT:   Expectedly Failed  : 1 (20.00%)
+# PROGRESS-NEXT:   Failed             : 1 (20.00%)
+# PROGRESS-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --terse-summary %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix TERSE < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# TERSE:      -- Testing: 5 tests, 1 workers --
+# TERSE-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# TERSE-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# TERSE-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# TERSE-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# TERSE-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# TERSE-NEXT: ********************
+# TERSE-NEXT: Failed Tests (1):
+# TERSE-NEXT:   verbosity :: fail.txt
+# TERSE-EMPTY:
+# TERSE-NEXT: ********************
+# TERSE-NEXT: Unexpectedly Passed Tests (1):
+# TERSE-NEXT:   verbosity :: xpass.txt
+# TERSE-EMPTY:
+# TERSE-EMPTY:
+# TERSE-NEXT: Total Discovered Tests: 5
+# TERSE-NEXT:   Failed             : 1 (20.00%)
+# TERSE-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+### Aliases in combination
+
+# RUN: not %{lit} -a -s %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix AS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# AS:      -- Testing: 5 tests, 1 workers --
+# AS-NEXT: Testing:
+# AS-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# AS-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# AS-NEXT: Exit Code: 127
+# AS-EMPTY:
+# AS-NEXT: Command Output (stdout):
+# AS-NEXT: --
+# AS-NEXT: # {{R}}UN: at line 1
+# AS-NEXT: echo "fail test output"
+# AS-NEXT: # executed command: echo 'fail test output'
+# AS-NEXT: # .---command stdout------------
+# AS-NEXT: # | fail test output
+# AS-NEXT: # `-----------------------------
+# AS-NEXT: # {{R}}UN: at line 2
+# AS-NEXT: fail
+# AS-NEXT: # executed command: fail
+# AS-NEXT: # .---command stderr------------
+# AS-NEXT: # | 'fail': command not found
+# AS-NEXT: # `-----------------------------
+# AS-NEXT: # error: command failed with exit status: 127
+# AS-EMPTY:
+# AS-NEXT: --
+# AS-EMPTY:
+# AS-NEXT: ********************
+# AS-NEXT: Testing:
+# AS-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# AS-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# AS-NEXT: Exit Code: 0
+# AS-EMPTY:
+# AS-NEXT: Command Output (stdout):
+# AS-NEXT: --
+# AS-NEXT: # {{R}}UN: at line 2
+# AS-NEXT: echo "xpass test output"
+# AS-NEXT: # executed command: echo 'xpass test output'
+# AS-NEXT: # .---command stdout------------
+# AS-NEXT: # | xpass test output
+# AS-NEXT: # `-----------------------------
+# AS-EMPTY:
+# AS-NEXT: --
+# AS-EMPTY:
+# AS-NEXT: ********************
+# AS-NEXT: Testing:
+# AS-NEXT: ********************
+# AS-NEXT: Failed Tests (1):
+# AS-NEXT:   verbosity :: fail.txt
+# AS-EMPTY:
+# AS-NEXT: ********************
+# AS-NEXT: Unexpectedly Passed Tests (1):
+# AS-NEXT:   verbosity :: xpass.txt
+# AS-EMPTY:
+# AS-EMPTY:
+# AS-NEXT: Testing Time: {{.*}}s
+# AS-EMPTY:
+# AS-NEXT: Total Discovered Tests: 5
+# AS-NEXT:   Unsupported        : 1 (20.00%)
+# AS-NEXT:   Passed             : 1 (20.00%)
+# AS-NEXT:   Expectedly Failed  : 1 (20.00%)
+# AS-NEXT:   Failed             : 1 (20.00%)
+# AS-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+# RUN: not %{lit} -s -a %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SA < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# SA:      -- Testing: 5 tests, 1 workers --
+# SA-NEXT: Testing:
+# SA-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# SA-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# SA-NEXT: Exit Code: 127
+# SA-EMPTY:
+# SA-NEXT: Command Output (stdout):
+# SA-NEXT: --
+# SA-NEXT: # {{R}}UN: at line 1
+# SA-NEXT: echo "fail test output"
+# SA-NEXT: # executed command: echo 'fail test output'
+# SA-NEXT: # .---command stdout------------
+# SA-NEXT: # | fail test output
+# SA-NEXT: # `-----------------------------
+# SA-NEXT: # {{R}}UN: at line 2
+# SA-NEXT: fail
+# SA-NEXT: # executed command: fail
+# SA-NEXT: # .---command stderr------------
+# SA-NEXT: # | 'fail': command not found
+# SA-NEXT: # `-----------------------------
+# SA-NEXT: # error: command failed with exit status: 127
+# SA-EMPTY:
+# SA-NEXT: --
+# SA-EMPTY:
+# SA-NEXT: ********************
+# SA-NEXT: Testing:
+# SA-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# SA-NEXT: Exit Code: 0
+# SA-EMPTY:
+# SA-NEXT: Command Output (stdout):
+# SA-NEXT: --
+# SA-NEXT: # {{R}}UN: at line 1
+# SA-NEXT: echo "pass test output"
+# SA-NEXT: # executed command: echo 'pass test output'
+# SA-NEXT: # .---command stdout------------
+# SA-NEXT: # | pass test output
+# SA-NEXT: # `-----------------------------
+# SA-EMPTY:
+# SA-NEXT: --
+# SA-EMPTY:
+# SA-NEXT: ********************
+# SA-NEXT: Testing:
+# SA-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# SA-NEXT: Test requires the following unavailable features: asdf
+# SA-NEXT: ********************
+# SA-NEXT: Testing:
+# SA-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# SA-NEXT: Exit Code: 1
+# SA-EMPTY:
+# SA-NEXT: Command Output (stdout):
+# SA-NEXT: --
+# SA-NEXT: # {{R}}UN: at line 2
+# SA-NEXT: not echo "xfail test output"
+# SA-NEXT: # executed command: not echo 'xfail test output'
+# SA-NEXT: # .---command stdout------------
+# SA-NEXT: # | xfail test output
+# SA-NEXT: # `-----------------------------
+# SA-NEXT: # error: command failed with exit status: 1
+# SA-EMPTY:
+# SA-NEXT: --
+# SA-EMPTY:
+# SA-NEXT: ********************
+# SA-NEXT: Testing:
+# SA-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# SA-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# SA-NEXT: Exit Code: 0
+# SA-EMPTY:
+# SA-NEXT: Command Output (stdout):
+# SA-NEXT: --
+# SA-NEXT: # {{R}}UN: at line 2
+# SA-NEXT: echo "xpass test output"
+# SA-NEXT: # executed command: echo 'xpass test output'
+# SA-NEXT: # .---command stdout------------
+# SA-NEXT: # | xpass test output
+# SA-NEXT: # `-----------------------------
+# SA-EMPTY:
+# SA-NEXT: --
+# SA-EMPTY:
+# SA-NEXT: ********************
+# SA-NEXT: Testing:
+# SA-NEXT: ********************
+# SA-NEXT: Failed Tests (1):
+# SA-NEXT:   verbosity :: fail.txt
+# SA-EMPTY:
+# SA-NEXT: ********************
+# SA-NEXT: Unexpectedly Passed Tests (1):
+# SA-NEXT:   verbosity :: xpass.txt
+# SA-EMPTY:
+# SA-EMPTY:
+# SA-NEXT: Testing Time: {{.*}}s
+# SA-EMPTY:
+# SA-NEXT: Total Discovered Tests: 5
+# SA-NEXT:   Unsupported        : 1 (20.00%)
+# SA-NEXT:   Passed             : 1 (20.00%)
+# SA-NEXT:   Expectedly Failed  : 1 (20.00%)
+# SA-NEXT:   Failed             : 1 (20.00%)
+# SA-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+# RUN: not %{lit} -q -a %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QA < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# QA:      -- Testing: 5 tests, 1 workers --
+# QA-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# QA-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# QA-NEXT: Exit Code: 127
+# QA-EMPTY:
+# QA-NEXT: Command Output (stdout):
+# QA-NEXT: --
+# QA-NEXT: # {{R}}UN: at line 1
+# QA-NEXT: echo "fail test output"
+# QA-NEXT: # executed command: echo 'fail test output'
+# QA-NEXT: # .---command stdout------------
+# QA-NEXT: # | fail test output
+# QA-NEXT: # `-----------------------------
+# QA-NEXT: # {{R}}UN: at line 2
+# QA-NEXT: fail
+# QA-NEXT: # executed command: fail
+# QA-NEXT: # .---command stderr------------
+# QA-NEXT: # | 'fail': command not found
+# QA-NEXT: # `-----------------------------
+# QA-NEXT: # error: command failed with exit status: 127
+# QA-EMPTY:
+# QA-NEXT: --
+# QA-EMPTY:
+# QA-NEXT: ********************
+# QA-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# QA-NEXT: Exit Code: 0
+# QA-EMPTY:
+# QA-NEXT: Command Output (stdout):
+# QA-NEXT: --
+# QA-NEXT: # {{R}}UN: at line 1
+# QA-NEXT: echo "pass test output"
+# QA-NEXT: # executed command: echo 'pass test output'
+# QA-NEXT: # .---command stdout------------
+# QA-NEXT: # | pass test output
+# QA-NEXT: # `-----------------------------
+# QA-EMPTY:
+# QA-NEXT: --
+# QA-EMPTY:
+# QA-NEXT: ********************
+# QA-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# QA-NEXT: Test requires the following unavailable features: asdf
+# QA-NEXT: ********************
+# QA-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# QA-NEXT: Exit Code: 1
+# QA-EMPTY:
+# QA-NEXT: Command Output (stdout):
+# QA-NEXT: --
+# QA-NEXT: # {{R}}UN: at line 2
+# QA-NEXT: not echo "xfail test output"
+# QA-NEXT: # executed command: not echo 'xfail test output'
+# QA-NEXT: # .---command stdout------------
+# QA-NEXT: # | xfail test output
+# QA-NEXT: # `-----------------------------
+# QA-NEXT: # error: command failed with exit status: 1
+# QA-EMPTY:
+# QA-NEXT: --
+# QA-EMPTY:
+# QA-NEXT: ********************
+# QA-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# QA-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# QA-NEXT: Exit Code: 0
+# QA-EMPTY:
+# QA-NEXT: Command Output (stdout):
+# QA-NEXT: --
+# QA-NEXT: # {{R}}UN: at line 2
+# QA-NEXT: echo "xpass test output"
+# QA-NEXT: # executed command: echo 'xpass test output'
+# QA-NEXT: # .---command stdout------------
+# QA-NEXT: # | xpass test output
+# QA-NEXT: # `-----------------------------
+# QA-EMPTY:
+# QA-NEXT: --
+# QA-EMPTY:
+# QA-NEXT: ********************
+# QA-NEXT: ********************
+# QA-NEXT: Failed Tests (1):
+# QA-NEXT:   verbosity :: fail.txt
+# QA-EMPTY:
+# QA-NEXT: ********************
+# QA-NEXT: Unexpectedly Passed Tests (1):
+# QA-NEXT:   verbosity :: xpass.txt
+# QA-EMPTY:
+# QA-EMPTY:
+# QA-NEXT: Total Discovered Tests: 5
+# QA-NEXT:   Failed             : 1 (20.00%)
+# QA-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} -a -q %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} -sqav %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SQAV < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# SQAV:      -- Testing: 5 tests, 1 workers --
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# SQAV-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# SQAV-NEXT: Exit Code: 127
+# SQAV-EMPTY:
+# SQAV-NEXT: Command Output (stdout):
+# SQAV-NEXT: --
+# SQAV-NEXT: # {{R}}UN: at line 1
+# SQAV-NEXT: echo "fail test output"
+# SQAV-NEXT: # executed command: echo 'fail test output'
+# SQAV-NEXT: # .---command stdout------------
+# SQAV-NEXT: # | fail test output
+# SQAV-NEXT: # `-----------------------------
+# SQAV-NEXT: # {{R}}UN: at line 2
+# SQAV-NEXT: fail
+# SQAV-NEXT: # executed command: fail
+# SQAV-NEXT: # .---command stderr------------
+# SQAV-NEXT: # | 'fail': command not found
+# SQAV-NEXT: # `-----------------------------
+# SQAV-NEXT: # error: command failed with exit status: 127
+# SQAV-EMPTY:
+# SQAV-NEXT: --
+# SQAV-EMPTY:
+# SQAV-NEXT: ********************
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# SQAV-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# SQAV-NEXT: Exit Code: 0
+# SQAV-EMPTY:
+# SQAV-NEXT: Command Output (stdout):
+# SQAV-NEXT: --
+# SQAV-NEXT: # {{R}}UN: at line 2
+# SQAV-NEXT: echo "xpass test output"
+# SQAV-NEXT: # executed command: echo 'xpass test output'
+# SQAV-NEXT: # .---command stdout------------
+# SQAV-NEXT: # | xpass test output
+# SQAV-NEXT: # `-----------------------------
+# SQAV-EMPTY:
+# SQAV-NEXT: --
+# SQAV-EMPTY:
+# SQAV-NEXT: ********************
+# SQAV-NEXT: Testing:
+# SQAV-NEXT: ********************
+# SQAV-NEXT: Failed Tests (1):
+# SQAV-NEXT:   verbosity :: fail.txt
+# SQAV-EMPTY:
+# SQAV-NEXT: ********************
+# SQAV-NEXT: Unexpectedly Passed Tests (1):
+# SQAV-NEXT:   verbosity :: xpass.txt
+# SQAV-EMPTY:
+# SQAV-EMPTY:
+# SQAV-NEXT: Total Discovered Tests: 5
+# SQAV-NEXT:   Failed             : 1 (20.00%)
+# SQAV-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+### Aliases with specific overrides
+
+# RUN: not %{lit} --quiet --no-terse-summary %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-W-SUMMARY < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# QUIET-W-SUMMARY:      -- Testing: 5 tests, 1 workers --
+# QUIET-W-SUMMARY-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# QUIET-W-SUMMARY-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# QUIET-W-SUMMARY-NEXT: ********************
+# QUIET-W-SUMMARY-NEXT: Failed Tests (1):
+# QUIET-W-SUMMARY-NEXT:   verbosity :: fail.txt
+# QUIET-W-SUMMARY-EMPTY:
+# QUIET-W-SUMMARY-NEXT: ********************
+# QUIET-W-SUMMARY-NEXT: Unexpectedly Passed Tests (1):
+# QUIET-W-SUMMARY-NEXT:   verbosity :: xpass.txt
+# QUIET-W-SUMMARY-EMPTY:
+# QUIET-W-SUMMARY-EMPTY:
+# QUIET-W-SUMMARY-NEXT: Testing Time: {{.*}}s
+# QUIET-W-SUMMARY-EMPTY:
+# QUIET-W-SUMMARY-NEXT: Total Discovered Tests: 5
+# QUIET-W-SUMMARY-NEXT:   Unsupported        : 1 (20.00%)
+# QUIET-W-SUMMARY-NEXT:   Passed             : 1 (20.00%)
+# QUIET-W-SUMMARY-NEXT:   Expectedly Failed  : 1 (20.00%)
+# QUIET-W-SUMMARY-NEXT:   Failed             : 1 (20.00%)
+# QUIET-W-SUMMARY-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+
+# RUN: not %{lit} --quiet --progress-bar %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-W-PROGRESS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# QUIET-W-PROGRESS: -- Testing: 5 tests, 1 workers --
+# QUIET-W-PROGRESS-NEXT: Testing:
+# QUIET-W-PROGRESS-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# QUIET-W-PROGRESS-NEXT: Testing:
+# QUIET-W-PROGRESS-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# QUIET-W-PROGRESS-NEXT: Testing:
+# QUIET-W-PROGRESS-NEXT: ********************
+# QUIET-W-PROGRESS-NEXT: Failed Tests (1):
+# QUIET-W-PROGRESS-NEXT:   verbosity :: fail.txt
+# QUIET-W-PROGRESS-EMPTY:
+# QUIET-W-PROGRESS-NEXT: ********************
+# QUIET-W-PROGRESS-NEXT: Unexpectedly Passed Tests (1):
+# QUIET-W-PROGRESS-NEXT:   verbosity :: xpass.txt
+# QUIET-W-PROGRESS-EMPTY:
+# QUIET-W-PROGRESS-EMPTY:
+# QUIET-W-PROGRESS-NEXT: Total Discovered Tests: 5
+# QUIET-W-PROGRESS-NEXT:   Failed             : 1 (20.00%)
+# QUIET-W-PROGRESS-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --show-all --terse-summary %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix ALL-TERSE < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# ALL-TERSE: -- Testing: 5 tests, 1 workers --
+# ALL-TERSE-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# ALL-TERSE-NEXT: ******************** TEST 'verbosity :: fail.txt' FAILED ********************
+# ALL-TERSE-NEXT: Exit Code: 127
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: Command Output (stdout):
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-NEXT: # {{R}}UN: at line 1
+# ALL-TERSE-NEXT: echo "fail test output"
+# ALL-TERSE-NEXT: # executed command: echo 'fail test output'
+# ALL-TERSE-NEXT: # .---command stdout------------
+# ALL-TERSE-NEXT: # | fail test output
+# ALL-TERSE-NEXT: # `-----------------------------
+# ALL-TERSE-NEXT: # {{R}}UN: at line 2
+# ALL-TERSE-NEXT: fail
+# ALL-TERSE-NEXT: # executed command: fail
+# ALL-TERSE-NEXT: # .---command stderr------------
+# ALL-TERSE-NEXT: # | 'fail': command not found
+# ALL-TERSE-NEXT: # `-----------------------------
+# ALL-TERSE-NEXT: # error: command failed with exit status: 127
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# ALL-TERSE-NEXT: Exit Code: 0
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: Command Output (stdout):
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-NEXT: # {{R}}UN: at line 1
+# ALL-TERSE-NEXT: echo "pass test output"
+# ALL-TERSE-NEXT: # executed command: echo 'pass test output'
+# ALL-TERSE-NEXT: # .---command stdout------------
+# ALL-TERSE-NEXT: # | pass test output
+# ALL-TERSE-NEXT: # `-----------------------------
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# ALL-TERSE-NEXT: Test requires the following unavailable features: asdf
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# ALL-TERSE-NEXT: Exit Code: 1
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: Command Output (stdout):
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-NEXT: # {{R}}UN: at line 2
+# ALL-TERSE-NEXT: not echo "xfail test output"
+# ALL-TERSE-NEXT: # executed command: not echo 'xfail test output'
+# ALL-TERSE-NEXT: # .---command stdout------------
+# ALL-TERSE-NEXT: # | xfail test output
+# ALL-TERSE-NEXT: # `-----------------------------
+# ALL-TERSE-NEXT: # error: command failed with exit status: 1
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# ALL-TERSE-NEXT: ******************** TEST 'verbosity :: xpass.txt' FAILED ********************
+# ALL-TERSE-NEXT: Exit Code: 0
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: Command Output (stdout):
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-NEXT: # {{R}}UN: at line 2
+# ALL-TERSE-NEXT: echo "xpass test output"
+# ALL-TERSE-NEXT: # executed command: echo 'xpass test output'
+# ALL-TERSE-NEXT: # .---command stdout------------
+# ALL-TERSE-NEXT: # | xpass test output
+# ALL-TERSE-NEXT: # `-----------------------------
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: --
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: Failed Tests (1):
+# ALL-TERSE-NEXT:   verbosity :: fail.txt
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: ********************
+# ALL-TERSE-NEXT: Unexpectedly Passed Tests (1):
+# ALL-TERSE-NEXT:   verbosity :: xpass.txt
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-EMPTY:
+# ALL-TERSE-NEXT: Total Discovered Tests: 5
+# ALL-TERSE-NEXT:   Failed             : 1 (20.00%)
+# ALL-TERSE-NEXT:   Unexpectedly Passed: 1 (20.00%)
+
+# RUN: not %{lit} --show-all --diagnostic-level error %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SHOW-ALL < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix QUIET-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --show-all --test-output off %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# RUN: not %{lit} --succinct --print-result-after all %{inputs}/verbosity 2> %t/stderr.txt > %t/stdout.txt
+# RUN: FileCheck %s --check-prefix SUCCINCT-RESULT-ALL < %t/stdout.txt
+# RUN: FileCheck %s --check-prefix NO-ARGS-ERR --implicit-check-not lit < %t/stderr.txt
+
+# SUCCINCT-RESULT-ALL:      -- Testing: 5 tests, 1 workers --
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: FAIL: verbosity :: fail.txt (1 of 5)
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: PASS: verbosity :: pass.txt (2 of 5)
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: {{UN}}SUPPORTED: verbosity :: unsupported.txt (3 of 5)
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: {{X}}FAIL: verbosity :: xfail.txt (4 of 5)
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: XPASS: verbosity :: xpass.txt (5 of 5)
+# SUCCINCT-RESULT-ALL-NEXT: Testing:
+# SUCCINCT-RESULT-ALL-NEXT: ********************
+# SUCCINCT-RESULT-ALL-NEXT: Failed Tests (1):
+# SUCCINCT-RESULT-ALL-NEXT:   verbosity :: fail.txt
+# SUCCINCT-RESULT-ALL-EMPTY:
+# SUCCINCT-RESULT-ALL-NEXT: ********************
+# SUCCINCT-RESULT-ALL-NEXT: Unexpectedly Passed Tests (1):
+# SUCCINCT-RESULT-ALL-NEXT:   verbosity :: xpass.txt
+# SUCCINCT-RESULT-ALL-EMPTY:
+# SUCCINCT-RESULT-ALL-EMPTY:
+# SUCCINCT-RESULT-ALL-NEXT: Testing Time: {{.*}}s
+# SUCCINCT-RESULT-ALL-EMPTY:
+# SUCCINCT-RESULT-ALL-NEXT: Total Discovered Tests: 5
+# SUCCINCT-RESULT-ALL-NEXT:   Unsupported        : 1 (20.00%)
+# SUCCINCT-RESULT-ALL-NEXT:   Passed             : 1 (20.00%)
+# SUCCINCT-RESULT-ALL-NEXT:   Expectedly Failed  : 1 (20.00%)
+# SUCCINCT-RESULT-ALL-NEXT:   Failed             : 1 (20.00%)
+# SUCCINCT-RESULT-ALL-NEXT:   Unexpectedly Passed: 1 (20.00%)
diff --git a/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
new file mode 100644
index 0000000000000..64a42a228199e
--- /dev/null
+++ b/mlir/include/mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h
@@ -0,0 +1,21 @@
+//===- ArithToAPFloat.h - Arith to APFloat impl conversion ---*- C++ ----*-===//
+//
+// Part of the APFloat Project, under the Apache License v2.0 with APFloat
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH APFloat-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+#define MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+
+#define GEN_PASS_DECL_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_ARITHTOAPFLOAT_ARITHTOAPFLOAT_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index 40d866ec7bf10..82bdfd02661a6 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -12,6 +12,7 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h"
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
 #include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h"
 #include "mlir/Conversion/ArithToEmitC/ArithToEmitCPass.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index e0cac8b699c30..d5665b439b059 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -186,6 +186,21 @@ def ArithToLLVMConversionPass : Pass<"convert-arith-to-llvm"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// ArithToAPFloat
+//===----------------------------------------------------------------------===//
+
+def ArithToAPFloatConversionPass
+    : Pass<"convert-arith-to-apfloat", "ModuleOp"> {
+  let summary = "Convert Arith ops to APFloat runtime library calls";
+  let description = [{
+    This pass converts supported Arith ops to APFloat-based runtime library
+    calls (APFloatWrappers.cpp). APFloat is a software implementation of
+    floating-point arithmetic operations.
+  }];
+  let dependentDialects = ["func::FuncDialect"];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithToSPIRV
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Func/Utils/Utils.h b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
index 3576126a487ac..00d50874a2e8d 100644
--- a/mlir/include/mlir/Dialect/Func/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Func/Utils/Utils.h
@@ -60,6 +60,13 @@ mlir::FailureOr<std::pair<mlir::func::FuncOp, mlir::func::CallOp>>
 deduplicateArgsOfFuncOp(mlir::RewriterBase &rewriter, mlir::func::FuncOp funcOp,
                         mlir::ModuleOp moduleOp);
 
+/// Look up a FuncOp with signature `resultTypes`(`paramTypes`)` and name
+/// `name`. Return a failure if the FuncOp is found but with a different
+/// signature.
+FailureOr<FuncOp> lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                               FunctionType funcT,
+                               SymbolTableCollection *symbolTables = nullptr);
+
 } // namespace func
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
index 8ad9ed18acebd..b09d32022e348 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
@@ -52,6 +52,10 @@ lookupOrCreatePrintF32Fn(OpBuilder &b, Operation *moduleOp,
 FailureOr<LLVM::LLVMFuncOp>
 lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
                          SymbolTableCollection *symbolTables = nullptr);
+FailureOr<LLVM::LLVMFuncOp>
+lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                             SymbolTableCollection *symbolTables = nullptr);
+
 /// Declares a function to print a C-string.
 /// If a custom runtime function is defined via `runtimeFunctionName`, it must
 /// have the signature void(char const*). The default function is `printString`.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index d11d196207b51..4c13c5ddb2886 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -921,6 +921,23 @@ def NVVM_Barrier0Op : NVVM_Op<"barrier0"> {
   }];
 }
 
+// Attrs describing the reduction operations for the barrier operation.
+def BarrierReductionPopc : I32EnumAttrCase<"POPC", 0, "popc">;
+def BarrierReductionAnd : I32EnumAttrCase<"AND", 1, "and">;
+def BarrierReductionOr : I32EnumAttrCase<"OR", 2, "or">;
+
+def BarrierReduction
+    : I32EnumAttr<"BarrierReduction", "NVVM barrier reduction operation",
+                  [BarrierReductionPopc, BarrierReductionAnd,
+                   BarrierReductionOr]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def BarrierReductionAttr
+    : EnumAttr<NVVM_Dialect, BarrierReduction, "reduction"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
 def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
   let summary = "CTA Barrier Synchronization Op";
   let description = [{
@@ -935,6 +952,9 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
     - `numberOfThreads`: Specifies the number of threads participating in the barrier. 
       When specified, the value must be a multiple of the warp size. If not specified, 
       all threads in the CTA participate in the barrier.
+    - `reductionOp`: specifies the reduction operation (`popc`, `and`, `or`).
+    - `reductionPredicate`: specifies the predicate to be used with the
+      `reductionOp`. 
 
     The barrier operation guarantees that when the barrier completes, prior memory 
     accesses requested by participating threads are performed relative to all threads 
@@ -951,31 +971,37 @@ def NVVM_BarrierOp : NVVM_Op<"barrier", [AttrSizedOperandSegments]> {
     [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar)
   }];
 
-  let arguments = (ins     
-    Optional<I32>:$barrierId,
-    Optional<I32>:$numberOfThreads);
+  let extraClassDeclaration = [{
+    static mlir::NVVM::IDArgPair
+      getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                            llvm::IRBuilderBase& builder);
+  }];
+
+  let arguments = (ins Optional<I32>:$barrierId, Optional<I32>:$numberOfThreads,
+      OptionalAttr<BarrierReductionAttr>:$reductionOp,
+      Optional<I32>:$reductionPredicate);
   string llvmBuilder = [{
-    llvm::Value *id = $barrierId ? $barrierId : builder.getInt32(0);
-    if ($numberOfThreads)
-      createIntrinsicCall(
-          builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_count,
-          {id, $numberOfThreads});
-    else
-      createIntrinsicCall(
-          builder, llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all, {id});
+    auto [id, args] = NVVM::BarrierOp::getIntrinsicIDAndArgs(
+                        *op, moduleTranslation, builder);
+    if ($reductionOp)
+      $res = createIntrinsicCall(builder, id, args);
+    else 
+      createIntrinsicCall(builder, id, args);
   }];
+  let results = (outs Optional<I32>:$res);
+
   let hasVerifier = 1;
 
-  let assemblyFormat = "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? attr-dict";
+  let assemblyFormat =
+      "(`id` `=` $barrierId^)? (`number_of_threads` `=` $numberOfThreads^)? "
+      "($reductionOp^ $reductionPredicate)? (`->` type($res)^)? attr-dict";
 
-  let builders = [
-    OpBuilder<(ins), [{
-      return build($_builder, $_state, Value{}, Value{});
+  let builders = [OpBuilder<(ins), [{
+      return build($_builder, $_state, TypeRange{}, Value{}, Value{}, {}, Value{});
     }]>,
-    OpBuilder<(ins "Value":$barrierId), [{
-      return build($_builder, $_state, barrierId, Value{});
-    }]>
-  ];
+                  OpBuilder<(ins "Value":$barrierId), [{
+      return build($_builder, $_state, TypeRange{}, barrierId, Value{}, {}, Value{});
+    }]>];
 }
 
 def NVVM_BarrierArriveOp : NVVM_PTXBuilder_Op<"barrier.arrive"> 
diff --git a/mlir/include/mlir/Reducer/ReductionPatternInterface.h b/mlir/include/mlir/Reducer/ReductionPatternInterface.h
index a85562fda4d93..a33877dc0bd77 100644
--- a/mlir/include/mlir/Reducer/ReductionPatternInterface.h
+++ b/mlir/include/mlir/Reducer/ReductionPatternInterface.h
@@ -10,6 +10,7 @@
 #define MLIR_REDUCER_REDUCTIONPATTERNINTERFACE_H
 
 #include "mlir/IR/DialectInterface.h"
+#include "mlir/Reducer/Tester.h"
 
 namespace mlir {
 
@@ -47,10 +48,17 @@ class DialectReductionPatternInterface
   /// replacing an operation with a constant.
   virtual void populateReductionPatterns(RewritePatternSet &patterns) const = 0;
 
+  /// This method extends `populateReductionPatterns` by allowing reduction
+  /// patterns to use a `Tester` instance. Some reduction patterns may need to
+  /// run tester to determine whether certain transformations preserve the
+  /// "interesting" behavior of the program. This is mostly useful when pattern
+  /// should choose between multiple modifications.
+  virtual void populateReductionPatternsWithTester(RewritePatternSet &patterns,
+                                                   Tester &tester) const {}
+
 protected:
   DialectReductionPatternInterface(Dialect *dialect) : Base(dialect) {}
 };
-
 } // namespace mlir
 
 #endif // MLIR_REDUCER_REDUCTIONPATTERNINTERFACE_H
diff --git a/mlir/include/mlir/Reducer/Tester.h b/mlir/include/mlir/Reducer/Tester.h
index eb44afc7c1c15..bed4408342034 100644
--- a/mlir/include/mlir/Reducer/Tester.h
+++ b/mlir/include/mlir/Reducer/Tester.h
@@ -36,6 +36,9 @@ class Tester {
     Untested,
   };
 
+  Tester() = default;
+  Tester(const Tester &) = default;
+
   Tester(StringRef testScript, ArrayRef<std::string> testScriptArgs);
 
   /// Runs the interestingness testing script on a MLIR test case file. Returns
@@ -46,6 +49,9 @@ class Tester {
   /// Return whether the file in the given path is interesting.
   Interestingness isInteresting(StringRef testCase) const;
 
+  void setTestScript(StringRef script) { testScript = script; }
+  void setTestScriptArgs(ArrayRef<std::string> args) { testScriptArgs = args; }
+
 private:
   StringRef testScript;
   ArrayRef<std::string> testScriptArgs;
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
new file mode 100644
index 0000000000000..699edb188a70a
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
@@ -0,0 +1,163 @@
+//===- ArithToAPFloat.cpp - Arithmetic to APFloat Conversion --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/ArithToAPFloat/ArithToAPFloat.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Utils/Utils.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Verifier.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_ARITHTOAPFLOATCONVERSIONPASS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::func;
+
+static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
+                           StringRef name, FunctionType funcT, bool setPrivate,
+                           SymbolTableCollection *symbolTables = nullptr) {
+  OpBuilder::InsertionGuard g(b);
+  assert(!symTable->getRegion(0).empty() && "expected non-empty region");
+  b.setInsertionPointToStart(&symTable->getRegion(0).front());
+  FuncOp funcOp = FuncOp::create(b, symTable->getLoc(), name, funcT);
+  if (setPrivate)
+    funcOp.setPrivate();
+  if (symbolTables) {
+    SymbolTable &symbolTable = symbolTables->getSymbolTable(symTable);
+    symbolTable.insert(funcOp, symTable->getRegion(0).front().begin());
+  }
+  return funcOp;
+}
+
+/// Helper function to look up or create the symbol for a runtime library
+/// function for a binary arithmetic operation.
+///
+/// Parameter 1: APFloat semantics
+/// Parameter 2: Left-hand side operand
+/// Parameter 3: Right-hand side operand
+///
+/// This function will return a failure if the function is found but has an
+/// unexpected signature.
+///
+static FailureOr<FuncOp>
+lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
+                       SymbolTableCollection *symbolTables = nullptr) {
+  auto i32Type = IntegerType::get(symTable->getContext(), 32);
+  auto i64Type = IntegerType::get(symTable->getContext(), 64);
+
+  std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str();
+  FunctionType funcT =
+      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
+  FailureOr<FuncOp> func =
+      lookupFnDecl(symTable, funcName, funcT, symbolTables);
+  // Failed due to type mismatch.
+  if (failed(func))
+    return func;
+  // Successfully matched existing decl.
+  if (*func)
+    return *func;
+
+  return createFnDecl(b, symTable, funcName, funcT,
+                      /*setPrivate=*/true, symbolTables);
+}
+
+/// Rewrite a binary arithmetic operation to an APFloat function call.
+template <typename OpTy>
+struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
+  BinaryArithOpToAPFloatConversion(MLIRContext *context,
+                                   const char *APFloatName,
+                                   SymbolOpInterface symTable,
+                                   PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable),
+        APFloatName(APFloatName) {};
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Get APFloat function from runtime library.
+    FailureOr<FuncOp> fn =
+        lookupOrCreateBinaryFn(rewriter, symTable, APFloatName);
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast operands to 64-bit integers.
+    Location loc = op.getLoc();
+    auto floatTy = cast<FloatType>(op.getType());
+    auto intWType = rewriter.getIntegerType(floatTy.getWidth());
+    auto int64Type = rewriter.getI64Type();
+    Value lhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getLhs()));
+    Value rhsBits = arith::ExtUIOp::create(
+        rewriter, loc, int64Type,
+        arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
+
+    // Call APFloat function.
+    int32_t sem =
+        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+    Value semValue = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+    SmallVector<Value> params = {semValue, lhsBits, rhsBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width.
+    Value truncatedBits = arith::TruncIOp::create(rewriter, loc, intWType,
+                                                  resultOp->getResult(0));
+    rewriter.replaceOp(
+        op, arith::BitcastOp::create(rewriter, loc, floatTy, truncatedBits));
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+  const char *APFloatName;
+};
+
+namespace {
+struct ArithToAPFloatConversionPass final
+    : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
+  using Base::Base;
+
+  void runOnOperation() override;
+};
+
+void ArithToAPFloatConversionPass::runOnOperation() {
+  MLIRContext *context = &getContext();
+  RewritePatternSet patterns(context);
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::AddFOp>>(context, "add",
+                                                                getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::SubFOp>>(
+      context, "subtract", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::MulFOp>>(
+      context, "multiply", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::DivFOp>>(
+      context, "divide", getOperation());
+  patterns.add<BinaryArithOpToAPFloatConversion<arith::RemFOp>>(
+      context, "remainder", getOperation());
+  LogicalResult result = success();
+  ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
+    if (diag.getSeverity() == DiagnosticSeverity::Error) {
+      result = failure();
+    }
+    // NB: if you don't return failure, no other diag handlers will fire (see
+    // mlir/lib/IR/Diagnostics.cpp:DiagnosticEngineImpl::emit).
+    return failure();
+  });
+  walkAndApplyPatterns(getOperation(), std::move(patterns));
+  if (failed(result))
+    return signalPassFailure();
+}
+} // namespace
diff --git a/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
new file mode 100644
index 0000000000000..b5ec49c087163
--- /dev/null
+++ b/mlir/lib/Conversion/ArithToAPFloat/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_mlir_conversion_library(MLIRArithToAPFloat
+  ArithToAPFloat.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToLLVM
+
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRArithDialect
+  MLIRArithTransforms
+  MLIRFuncDialect
+  MLIRFuncUtils
+  )
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index b6099902cc337..f2bacc3399144 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/LLVMCommon/VectorPattern.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/TypeUtilities.h"
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index bebf1b8fff3f9..613dc6d242ceb 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard)
 add_subdirectory(AMDGPUToROCDL)
 add_subdirectory(ArithCommon)
 add_subdirectory(ArithToAMDGPU)
+add_subdirectory(ArithToAPFloat)
 add_subdirectory(ArithToArmSME)
 add_subdirectory(ArithToEmitC)
 add_subdirectory(ArithToLLVM)
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 69a317ecd101f..c747e1b59558a 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1654,6 +1654,20 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
           return failure();
         }
       }
+    } else if (auto floatTy = dyn_cast<FloatType>(printType)) {
+      // Print other floating-point types using the APFloat runtime library.
+      int32_t sem =
+          llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+      Value semValue = LLVM::ConstantOp::create(
+          rewriter, loc, rewriter.getI32Type(),
+          rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+      Value floatBits =
+          LLVM::ZExtOp::create(rewriter, loc, rewriter.getI64Type(), value);
+      printer =
+          LLVM::lookupOrCreateApFloatPrintFn(rewriter, parent, symbolTables);
+      emitCall(rewriter, loc, printer.value(),
+               ValueRange({semValue, floatBits}));
+      return success();
     } else {
       return failure();
     }
diff --git a/mlir/lib/Dialect/Func/Utils/Utils.cpp b/mlir/lib/Dialect/Func/Utils/Utils.cpp
index b4cb0932ef631..d6dfd0229963c 100644
--- a/mlir/lib/Dialect/Func/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Func/Utils/Utils.cpp
@@ -254,3 +254,28 @@ func::deduplicateArgsOfFuncOp(RewriterBase &rewriter, func::FuncOp funcOp,
 
   return std::make_pair(*newFuncOpOrFailure, newCallOp);
 }
+
+FailureOr<func::FuncOp>
+func::lookupFnDecl(SymbolOpInterface symTable, StringRef name,
+                   FunctionType funcT, SymbolTableCollection *symbolTables) {
+  FuncOp func;
+  if (symbolTables) {
+    func = symbolTables->lookupSymbolIn<FuncOp>(
+        symTable, StringAttr::get(symTable->getContext(), name));
+  } else {
+    func = llvm::dyn_cast_or_null<FuncOp>(
+        SymbolTable::lookupSymbolIn(symTable, name));
+  }
+
+  if (!func)
+    return func;
+
+  mlir::FunctionType foundFuncT = func.getFunctionType();
+  // Assert the signature of the found function is same as expected
+  if (funcT != foundFuncT) {
+    return func.emitError("matched function '")
+           << name << "' but with different type: " << foundFuncT
+           << " (expected " << funcT << ")";
+  }
+  return func;
+}
diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
index feaffa34897b6..160b6ae89215c 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp
@@ -30,6 +30,7 @@ static constexpr llvm::StringRef kPrintF16 = "printF16";
 static constexpr llvm::StringRef kPrintBF16 = "printBF16";
 static constexpr llvm::StringRef kPrintF32 = "printF32";
 static constexpr llvm::StringRef kPrintF64 = "printF64";
+static constexpr llvm::StringRef kPrintApFloat = "printApFloat";
 static constexpr llvm::StringRef kPrintString = "printString";
 static constexpr llvm::StringRef kPrintOpen = "printOpen";
 static constexpr llvm::StringRef kPrintClose = "printClose";
@@ -160,6 +161,16 @@ mlir::LLVM::lookupOrCreatePrintF64Fn(OpBuilder &b, Operation *moduleOp,
       LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
 }
 
+FailureOr<LLVM::LLVMFuncOp>
+mlir::LLVM::lookupOrCreateApFloatPrintFn(OpBuilder &b, Operation *moduleOp,
+                                         SymbolTableCollection *symbolTables) {
+  return lookupOrCreateReservedFn(
+      b, moduleOp, kPrintApFloat,
+      {IntegerType::get(moduleOp->getContext(), 32),
+       IntegerType::get(moduleOp->getContext(), 64)},
+      LLVM::LLVMVoidType::get(moduleOp->getContext()), symbolTables);
+}
+
 static LLVM::LLVMPointerType getCharPtr(MLIRContext *context) {
   return LLVM::LLVMPointerType::get(context);
 }
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index e0c25ab6cdef7..0f7b3638fb30d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1517,6 +1517,15 @@ LogicalResult NVVM::BarrierOp::verify() {
   if (getNumberOfThreads() && !getBarrierId())
     return emitOpError(
         "barrier id is missing, it should be set between 0 to 15");
+
+  if (getBarrierId() && (getReductionOp() || getReductionPredicate()))
+    return emitOpError("reduction are only available when id is 0");
+
+  if ((getReductionOp() && !getReductionPredicate()) ||
+      (!getReductionOp() && getReductionPredicate()))
+    return emitOpError("reduction predicate and reduction operation must be "
+                       "specified together");
+
   return success();
 }
 
@@ -1785,6 +1794,39 @@ std::string NVVM::MBarrierTryWaitParityOp::getPtx() {
 // getIntrinsicID/getIntrinsicIDAndArgs methods
 //===----------------------------------------------------------------------===//
 
+mlir::NVVM::IDArgPair NVVM::BarrierOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+  auto thisOp = cast<NVVM::BarrierOp>(op);
+  llvm::Value *barrierId = thisOp.getBarrierId()
+                               ? mt.lookupValue(thisOp.getBarrierId())
+                               : builder.getInt32(0);
+  llvm::Intrinsic::ID id;
+  llvm::SmallVector<llvm::Value *> args;
+  if (thisOp.getNumberOfThreads()) {
+    id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_count;
+    args.push_back(barrierId);
+    args.push_back(mt.lookupValue(thisOp.getNumberOfThreads()));
+  } else if (thisOp.getReductionOp()) {
+    switch (*thisOp.getReductionOp()) {
+    case NVVM::BarrierReduction::AND:
+      id = llvm::Intrinsic::nvvm_barrier0_and;
+      break;
+    case NVVM::BarrierReduction::OR:
+      id = llvm::Intrinsic::nvvm_barrier0_or;
+      break;
+    case NVVM::BarrierReduction::POPC:
+      id = llvm::Intrinsic::nvvm_barrier0_popc;
+      break;
+    }
+    args.push_back(mt.lookupValue(thisOp.getReductionPredicate()));
+  } else {
+    id = llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all;
+    args.push_back(barrierId);
+  }
+
+  return {id, std::move(args)};
+}
+
 mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs(
     Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
   auto thisOp = cast<NVVM::MBarrierInitOp>(op);
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
new file mode 100644
index 0000000000000..0a05f7369e556
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
@@ -0,0 +1,89 @@
+//===- APFloatWrappers.cpp - Software Implementation of FP Arithmetics --- ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes the APFloat infrastructure to MLIR programs as a runtime
+// library. APFloat is a software implementation of floating point arithmetics.
+//
+// On the MLIR side, floating-point values must be bitcasted to 64-bit integers
+// before calling a runtime function. If a floating-point type has less than
+// 64 bits, it must be zero-extended to 64 bits after bitcasting it to an
+// integer.
+//
+// Runtime functions receive the floating-point operands of the arithmeic
+// operation in the form of 64-bit integers, along with the APFloat semantics
+// in the form of a 32-bit integer, which will be interpreted as an
+// APFloatBase::Semantics enum value.
+//
+#include "llvm/ADT/APFloat.h"
+
+#ifdef _WIN32
+#ifndef MLIR_APFLOAT_WRAPPERS_EXPORT
+#ifdef mlir_apfloat_wrappers_EXPORTS
+// We are building this library
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllexport)
+#else
+// We are using this library
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __declspec(dllimport)
+#endif // mlir_apfloat_wrappers_EXPORTS
+#endif // MLIR_APFLOAT_WRAPPERS_EXPORT
+#else
+// Non-windows: use visibility attributes.
+#define MLIR_APFLOAT_WRAPPERS_EXPORT __attribute__((visibility("default")))
+#endif // _WIN32
+
+/// Binary operations without rounding mode.
+#define APFLOAT_BINARY_OP(OP)                                                  \
+  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs);                                                               \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+/// Binary operations with rounding mode.
+#define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
+  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
+      int32_t semantics, uint64_t a, uint64_t b) {                             \
+    const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
+        static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
+    unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);           \
+    llvm::APFloat lhs(sem, llvm::APInt(bitWidth, a));                          \
+    llvm::APFloat rhs(sem, llvm::APInt(bitWidth, b));                          \
+    lhs.OP(rhs, ROUNDING_MODE);                                                \
+    return lhs.bitcastToAPInt().getZExtValue();                                \
+  }
+
+extern "C" {
+
+#define BIN_OPS_WITH_ROUNDING(X)                                               \
+  X(add, llvm::RoundingMode::NearestTiesToEven)                                \
+  X(subtract, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(multiply, llvm::RoundingMode::NearestTiesToEven)                           \
+  X(divide, llvm::RoundingMode::NearestTiesToEven)
+
+BIN_OPS_WITH_ROUNDING(APFLOAT_BINARY_OP_ROUNDING_MODE)
+#undef BIN_OPS_WITH_ROUNDING
+#undef APFLOAT_BINARY_OP_ROUNDING_MODE
+
+APFLOAT_BINARY_OP(remainder)
+
+#undef APFLOAT_BINARY_OP
+
+MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) {
+  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics));
+  unsigned bitWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
+  llvm::APFloat x(sem, llvm::APInt(bitWidth, a));
+  double d = x.convertToDouble();
+  fprintf(stdout, "%lg", d);
+}
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index fdeb4dacf9278..0045675bcb448 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -2,6 +2,7 @@
 # is a big dependency which most don't need.
 
 set(LLVM_OPTIONAL_SOURCES
+  APFloatWrappers.cpp
   ArmRunnerUtils.cpp
   ArmSMEStubs.cpp
   AsyncRuntime.cpp
@@ -167,6 +168,20 @@ if(LLVM_ENABLE_PIC)
   set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17)
   target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS)
 
+  add_mlir_library(mlir_apfloat_wrappers
+    SHARED
+    APFloatWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+    )
+  set_target_properties(
+    mlir_apfloat_wrappers
+    PROPERTIES CXX_STANDARD 17
+               CXX_VISIBILITY_PRESET hidden
+               VISIBILITY_INLINES_HIDDEN ON
+  )
+  target_compile_definitions(mlir_apfloat_wrappers PRIVATE mlir_apfloat_wrappers_EXPORTS)
+
   add_subdirectory(SparseTensor)
 
   add_mlir_library(mlir_c_runner_utils
@@ -177,6 +192,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
     MLIRSparseTensorEnums
     MLIRSparseTensorRuntime
@@ -191,6 +207,7 @@ if(LLVM_ENABLE_PIC)
     EXCLUDE_FROM_LIBMLIR
 
     LINK_LIBS PUBLIC
+    mlir_apfloat_wrappers
     mlir_float16_utils
   )
   target_compile_definitions(mlir_runner_utils PRIVATE mlir_runner_utils_EXPORTS)
diff --git a/mlir/lib/Reducer/ReductionTreePass.cpp b/mlir/lib/Reducer/ReductionTreePass.cpp
index 5b49204013cc0..1e00ed645f71e 100644
--- a/mlir/lib/Reducer/ReductionTreePass.cpp
+++ b/mlir/lib/Reducer/ReductionTreePass.cpp
@@ -175,9 +175,12 @@ class ReductionPatternInterfaceCollection
   using Base::Base;
 
   // Collect the reduce patterns defined by each dialect.
-  void populateReductionPatterns(RewritePatternSet &pattern) const {
-    for (const DialectReductionPatternInterface &interface : *this)
+  void populateReductionPatterns(RewritePatternSet &pattern,
+                                 Tester &tester) const {
+    for (const DialectReductionPatternInterface &interface : *this) {
       interface.populateReductionPatterns(pattern);
+      interface.populateReductionPatternsWithTester(pattern, tester);
+    }
   }
 };
 
@@ -201,15 +204,21 @@ class ReductionTreePass
 private:
   LogicalResult reduceOp(ModuleOp module, Region &region);
 
+  Tester tester;
   FrozenRewritePatternSet reducerPatterns;
 };
 
 } // namespace
 
 LogicalResult ReductionTreePass::initialize(MLIRContext *context) {
+  tester.setTestScript(testerName);
+  tester.setTestScriptArgs(testerArgs);
+
   RewritePatternSet patterns(context);
+
   ReductionPatternInterfaceCollection reducePatternCollection(context);
-  reducePatternCollection.populateReductionPatterns(patterns);
+  reducePatternCollection.populateReductionPatterns(patterns, tester);
+
   reducerPatterns = std::move(patterns);
   return success();
 }
@@ -244,11 +253,10 @@ void ReductionTreePass::runOnOperation() {
 }
 
 LogicalResult ReductionTreePass::reduceOp(ModuleOp module, Region &region) {
-  Tester test(testerName, testerArgs);
   switch (traversalModeId) {
   case TraversalMode::SinglePath:
     return findOptimal<ReductionNode::iterator<TraversalMode::SinglePath>>(
-        module, region, reducerPatterns, test);
+        module, region, reducerPatterns, tester);
   default:
     return module.emitError() << "unsupported traversal mode detected";
   }
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
new file mode 100644
index 0000000000000..797f42c37a26f
--- /dev/null
+++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-opt %s --convert-arith-to-apfloat -split-input-file -verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL:   func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
+
+// CHECK-LABEL:   func.func @foo() -> f8E4M3FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 2.250000e+00 : f8E4M3FN
+// CHECK:           return %[[CONSTANT_0]] : f8E4M3FN
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @bar() -> f6E3M2FN {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 3.000000e+00 : f6E3M2FN
+// CHECK:           return %[[CONSTANT_0]] : f6E3M2FN
+// CHECK:         }
+
+// Illustrate that both f8E4M3FN and f6E3M2FN calling the same _mlir_apfloat_add is fine
+// because each gets its own semantics enum and gets bitcast/extui/trunci to its own width.
+// CHECK-LABEL:   func.func @full_example() {
+// CHECK:           %[[CONSTANT_0:.*]] = arith.constant 1.375000e+00 : f8E4M3FN
+// CHECK:           %[[VAL_0:.*]] = call @foo() : () -> f8E4M3FN
+// CHECK:           %[[BITCAST_0:.*]] = arith.bitcast %[[CONSTANT_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_0:.*]] = arith.extui %[[BITCAST_0]] : i8 to i64
+// CHECK:           %[[BITCAST_1:.*]] = arith.bitcast %[[VAL_0]] : f8E4M3FN to i8
+// CHECK:           %[[EXTUI_1:.*]] = arith.extui %[[BITCAST_1]] : i8 to i64
+//                  // fltSemantics semantics for f8E4M3FN
+// CHECK:           %[[CONSTANT_1:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_1:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_1]], %[[EXTUI_0]], %[[EXTUI_1]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_0:.*]] = arith.trunci %[[VAL_1]] : i64 to i8
+// CHECK:           %[[BITCAST_2:.*]] = arith.bitcast %[[TRUNCI_0]] : i8 to f8E4M3FN
+// CHECK:           vector.print %[[BITCAST_2]] : f8E4M3FN
+
+// CHECK:           %[[CONSTANT_2:.*]] = arith.constant 2.500000e+00 : f6E3M2FN
+// CHECK:           %[[VAL_2:.*]] = call @bar() : () -> f6E3M2FN
+// CHECK:           %[[BITCAST_3:.*]] = arith.bitcast %[[CONSTANT_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_2:.*]] = arith.extui %[[BITCAST_3]] : i6 to i64
+// CHECK:           %[[BITCAST_4:.*]] = arith.bitcast %[[VAL_2]] : f6E3M2FN to i6
+// CHECK:           %[[EXTUI_3:.*]] = arith.extui %[[BITCAST_4]] : i6 to i64
+//                  // fltSemantics semantics for f6E3M2FN
+// CHECK:           %[[CONSTANT_3:.*]] = arith.constant 16 : i32
+// CHECK:           %[[VAL_3:.*]] = call @_mlir_apfloat_add(%[[CONSTANT_3]], %[[EXTUI_2]], %[[EXTUI_3]]) : (i32, i64, i64) -> i64
+// CHECK:           %[[TRUNCI_1:.*]] = arith.trunci %[[VAL_3]] : i64 to i6
+// CHECK:           %[[BITCAST_5:.*]] = arith.bitcast %[[TRUNCI_1]] : i6 to f6E3M2FN
+// CHECK:           vector.print %[[BITCAST_5]] : f6E3M2FN
+// CHECK:           return
+// CHECK:         }
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> f8E4M3FN {
+  %cst = arith.constant 2.2 : f8E4M3FN
+  return %cst : f8E4M3FN
+}
+
+func.func @bar() -> f6E3M2FN {
+  %cst = arith.constant 3.2 : f6E3M2FN
+  return %cst : f6E3M2FN
+}
+
+func.func @full_example() {
+  %a = arith.constant 1.4 : f8E4M3FN
+  %b = func.call @foo() : () -> (f8E4M3FN)
+  %c = arith.addf %a, %b : f8E4M3FN
+  vector.print %c : f8E4M3FN
+
+  %d = arith.constant 2.4 : f6E3M2FN
+  %e = func.call @bar() : () -> (f6E3M2FN)
+  %f = arith.addf %d, %e : f6E3M2FN
+  vector.print %f : f6E3M2FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_add(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_add(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// Test decl collision (different type)
+// expected-error@+1{{matched function '_mlir_apfloat_add' but with different type: '(i32, i32, f32) -> index' (expected '(i32, i64, i64) -> i64')}}
+func.func private @_mlir_apfloat_add(i32, i32, f32) -> index
+func.func @addf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.addf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_subtract(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_subtract(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.subf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_multiply(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_multiply(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.mulf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_divide(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_divide(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @subf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.divf %arg0, %arg1 : f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_remainder(i32, i64, i64) -> i64
+// CHECK: %[[sem:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_remainder(%[[sem]], %{{.*}}, %{{.*}}) : (i32, i64, i64) -> i64
+func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
+  %0 = arith.remf %arg0, %arg1 : f4E2M1FN
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
new file mode 100644
index 0000000000000..2768afe0834b5
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
@@ -0,0 +1,36 @@
+// Case 1: All floating-point arithmetics is lowered through APFloat.
+// RUN: mlir-opt %s --convert-arith-to-apfloat --convert-to-llvm | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils \
+// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
+
+// Case 2: Only unsupported arithmetics (f8E4M3FN) is lowered through APFloat.
+//         Arithmetics on f32 is lowered directly to LLVM.
+// RUN: mlir-opt %s --convert-to-llvm --convert-arith-to-apfloat \
+// RUN:          --convert-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-runner -e entry --entry-point-result=void \
+// RUN:             --shared-libs=%mlir_c_runner_utils \
+// RUN:             --shared-libs=%mlir_apfloat_wrappers | FileCheck %s
+
+// Put rhs into separate function so that it won't be constant-folded.
+func.func @foo() -> (f8E4M3FN, f32) {
+  %cst1 = arith.constant 2.2 : f8E4M3FN
+  %cst2 = arith.constant 2.2 : f32
+  return %cst1, %cst2 : f8E4M3FN, f32
+}
+
+func.func @entry() {
+  %a1 = arith.constant 1.4 : f8E4M3FN
+  %a2 = arith.constant 1.4 : f32
+  %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
+  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
+  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
+
+  // CHECK: 3.5
+  vector.print %c1 : f8E4M3FN
+
+  // CHECK: 3.6
+  vector.print %c2 : f32
+
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
index 127ab70cb4539..610ed63168d87 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/runtime-verification.mlir
@@ -24,17 +24,14 @@ func.func @main() {
   %d5x = tensor.cast %c5x : tensor<5xf32> to tensor<?xf32>
   %d4x = tensor.cast %c4x : tensor<4xf32> to tensor<?xf32>
 
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @simple_add(%d5x, %d5x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
-
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
   func.call @simple_add(%d5x, %d4x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
 
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
   func.call @simple_add(%d4x, %d5x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
 
   %c1x1 = arith.constant dense<0.0> : tensor<1x1xf32>
@@ -48,71 +45,81 @@ func.func @main() {
   %d4x5 = tensor.cast %c4x5 : tensor<4x5xf32> to tensor<?x?xf32>
   %d5x4 = tensor.cast %c5x4 : tensor<5x4xf32> to tensor<?x?xf32>
 
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @broadcast_add(%d1x1, %d1x1) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
-
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @broadcast_add(%d1x1, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
-
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @broadcast_add(%d4x4, %d1x4) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+  // CHECK: ERROR: Runtime op verification failed
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
 
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #1 of input/output operand #2 is incompatible with inferred dimension size
   func.call @broadcast_add(%d1x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size 
+
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #1 of input/output operand #1 is incompatible with inferred dimension size
+
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #1 of input/output operand #2 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #1 of input/output operand #2 is incompatible with inferred dimension size
   func.call @broadcast_add(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @matmul_generic(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
-
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.generic
+  // CHECK-NEXT: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
   func.call @matmul_generic(%d4x5, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
-  // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @matmul_named(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
-
   // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.matmul
-  // CHECK: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
+  // CHECK-NEXT: linalg.matmul
+  // CHECK-NEXT: ^ dimension #0 of input/output operand #1 is incompatible with inferred dimension size
   func.call @matmul_named(%d4x5, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
   %c64x57 = arith.constant dense<0.0> : tensor<16x29xf32>
   %c3x4 = arith.constant dense<0.0> : tensor<3x4xf32>
 
+  // TODO: BROKEN CHK: ERROR: Runtime op verification failed
+  // TODO: BROKEN CHK-NEXT: linalg.generic
+  // TODO: BROKEN CHK-NEXT: unexpected negative result on dimension #0 of input/output operand #0
+  // TODO: BROKEN func.call @reverse_from_3(%d5x) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+  %c0x = arith.constant dense<1.0> : tensor<0xf32>
+  %d0x = tensor.cast %c0x : tensor<0xf32> to tensor<?xf32>
+
+  %c0x5 = arith.constant dense<0.0> : tensor<0x5xf32>
+  %d0x5 = tensor.cast %c0x5 : tensor<0x5xf32> to tensor<?x?xf32>
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @fill_empty_1d(%d0x) : (tensor<?xf32>) -> (tensor<?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @simple_add(%d5x, %d5x) : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @fill_empty_2d(%d0x5) : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
   // CHECK-NOT: ERROR: Runtime op verification failed
   func.call @conv(%c64x57, %c3x4) : (tensor<16x29xf32>, tensor<3x4xf32>) -> (tensor<5x7xf32>)
 
   // CHECK-NOT: ERROR: Runtime op verification failed
   func.call @reverse_from_3(%d4x) : (tensor<?xf32>) -> (tensor<?xf32>)
 
-  // CHECK: ERROR: Runtime op verification failed
-  // CHECK: linalg.generic
-  // CHECK: unexpected negative result on dimension #0 of input/output operand #0
-  func.call @reverse_from_3(%d5x) : (tensor<?xf32>) -> (tensor<?xf32>)
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @matmul_named(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
-  %c0x = arith.constant dense<1.0> : tensor<0xf32>
-  %d0x = tensor.cast %c0x : tensor<0xf32> to tensor<?xf32>
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @fill_empty_1d(%d0x) : (tensor<?xf32>) -> (tensor<?xf32>)
+  func.call @matmul_generic(%d5x4, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
-  %c0x5 = arith.constant dense<0.0> : tensor<0x5xf32>
-  %d0x5 = tensor.cast %c0x5 : tensor<0x5xf32> to tensor<?x?xf32>
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @broadcast_add(%d1x1, %d1x1) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
   // CHECK-NOT: ERROR: Runtime op verification failed
-  func.call @fill_empty_2d(%d0x5) : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
+  func.call @broadcast_add(%d1x1, %d4x5) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
+
+  // CHECK-NOT: ERROR: Runtime op verification failed
+  func.call @broadcast_add(%d4x4, %d1x4) : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
 
   return
 }
diff --git a/mlir/test/Target/LLVMIR/nvvm/barrier.mlir b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
new file mode 100644
index 0000000000000..d89f93101c1fc
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/barrier.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-translate -mlir-to-llvmir %s  -split-input-file --verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL: @llvm_nvvm_barrier(
+// CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]], i32 %[[redOperand:.*]])
+llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32, %redOperand : i32) {
+  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+  nvvm.barrier
+  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %[[barId]])
+  nvvm.barrier id = %barID
+  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %[[barId]], i32 %[[numThreads]])
+  nvvm.barrier id = %barID number_of_threads = %numberOfThreads
+  // CHECK: %{{.*}} = call i32 @llvm.nvvm.barrier0.and(i32 %[[redOperand]])
+  %0 = nvvm.barrier #nvvm.reduction<and> %redOperand -> i32
+  // CHECK: %{{.*}} = call i32 @llvm.nvvm.barrier0.or(i32 %[[redOperand]])
+  %1 = nvvm.barrier #nvvm.reduction<or> %redOperand -> i32
+  // CHECK: %{{.*}} = call i32 @llvm.nvvm.barrier0.popc(i32 %[[redOperand]])
+  %2 = nvvm.barrier #nvvm.reduction<popc> %redOperand -> i32
+
+  llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index fec54cbf5e3e5..5cba5c4fceefd 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -166,25 +166,6 @@ llvm.func @nvvm_rcp(%0: f32) -> f32 {
   llvm.return %1 : f32
 }
 
-// CHECK-LABEL: @llvm_nvvm_barrier0
-llvm.func @llvm_nvvm_barrier0() {
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-  nvvm.barrier0
-  llvm.return
-}
-
-// CHECK-LABEL: @llvm_nvvm_barrier(
-// CHECK-SAME: i32 %[[barId:.*]], i32 %[[numThreads:.*]])
-llvm.func @llvm_nvvm_barrier(%barID : i32, %numberOfThreads : i32) {
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-  nvvm.barrier
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %[[barId]])
-  nvvm.barrier id = %barID
-  // CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %[[barId]], i32 %[[numThreads]])
-  nvvm.barrier id = %barID number_of_threads = %numberOfThreads
-  llvm.return
-}
-
 // CHECK-LABEL: @llvm_nvvm_cluster_arrive
 llvm.func @llvm_nvvm_cluster_arrive() {
   // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 6ff12d66523f5..4a38ed605be0c 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -208,6 +208,7 @@ def find_real_python_interpreter():
     add_runtime("mlir_c_runner_utils"),
     add_runtime("mlir_async_runtime"),
     add_runtime("mlir_float16_utils"),
+    add_runtime("mlir_apfloat_wrappers"),
     "mlir-linalg-ods-yaml-gen",
     "mlir-reduce",
     "mlir-pdll",