diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py index e1bc59f389b36..269f75cace266 100644 --- a/.ci/premerge_advisor_explain.py +++ b/.ci/premerge_advisor_explain.py @@ -39,6 +39,7 @@ def get_comment( ) -> dict[str, str]: repo = github.Github(github_token).get_repo("llvm/llvm-project") pr = repo.get_issue(pr_number).as_pull_request() + body = COMMENT_TAG.format(platform=platform.system()) + "\n" + body comment = {"body": body} comment_id = get_comment_id(platform.system(), pr) if comment_id: @@ -128,7 +129,7 @@ def main( ), ) ] - with open("comment", "w") as comment_file_handle: + with open("comments", "w") as comment_file_handle: json.dump(comments, comment_file_handle) else: print(advisor_response.reason) diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 3a048aeb9405b..01fd895cce7e8 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -48,6 +48,42 @@ jobs: cpp_compiler: clang++-22 target: x86_64-unknown-uefi-llvm include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv6m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7em-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8.1m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: riscv32-unknown-elf + include_scudo: OFF # TODO: add back gcc build when it is fixed # - c_compiler: gcc # cpp_compiler: g++ @@ -93,28 +129,39 @@ jobs: run: | export RUNTIMES="libc" + export CMAKE_FLAGS=" + -G Ninja + -S ${{ github.workspace }}/runtimes + -B ${{ steps.strings.outputs.build-output-dir }} + -DCMAKE_ASM_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCMAKE_C_COMPILER_LAUNCHER=sccache + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }}" + if [[ ${{ matrix.include_scudo}} == "ON" ]]; then export RUNTIMES="$RUNTIMES;compiler-rt" - export CMAKE_FLAGS=" + export CMAKE_FLAGS="$CMAKE_FLAGS -DLLVM_LIBC_INCLUDE_SCUDO=ON -DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON -DCOMPILER_RT_BUILD_GWP_ASAN=OFF -DCOMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED=OFF" fi - cmake -B ${{ steps.strings.outputs.build-output-dir }} \ - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} \ - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} \ - -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \ - -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \ - -DLLVM_LIBC_FULL_BUILD=ON \ - -G Ninja \ - -S ${{ github.workspace }}/runtimes \ - $CMAKE_FLAGS + case "${{ matrix.target }}" in + *-none-eabi|riscv32-unknown-elf) + cmake $CMAKE_FLAGS \ + -C ${{ 
github.workspace }}/libc/cmake/caches/${{ matrix.target }}.cmake
+          ;;
+        *)
+          cmake -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \
+            -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \
+            -DLLVM_LIBC_FULL_BUILD=ON \
+            $CMAKE_FLAGS
+          ;;
+      esac

     - name: Build
       run: >
         cmake
@@ -124,8 +171,12 @@ jobs:
         --target install

     - name: Test
-      # Skip UEFI tests until we have testing set up.
-      if: ${{ ! endsWith(matrix.target, '-uefi-llvm') }}
+      # Skip UEFI and baremetal tests until we have testing set up.
+      if: ${{
+        !endsWith(matrix.target, '-uefi-llvm') &&
+        !endsWith(matrix.target, '-none-eabi') &&
+        matrix.target != 'riscv32-unknown-elf'
+        }}
       run: >
         cmake
         --build ${{ steps.strings.outputs.build-output-dir }}
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 02a6f3b868d85..daf88b5b22125 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -119,6 +119,14 @@ jobs:
           path: artifacts/
           retention-days: 5
           include-hidden-files: 'true'
+      - name: Upload Comment
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        if: ${{ always() && !startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') }}
+        continue-on-error: true
+        with:
+          name: workflow-args
+          path: |
+            comments

   premerge-checks-windows:
     name: Build and Test Windows
diff --git a/.gitignore b/.gitignore
index 860b8ea12abd4..a9d616286adf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,9 +54,9 @@ autoconf/autom4te.cache
 /cmake-build*
 # Coding assistants' stuff
 /CLAUDE.md
-/.claude/
+.claude/
 /GEMINI.md
-/.gemini/
+.gemini/

 #==============================================================================#
 # Directories to ignore (do not add trailing '/'s, they skip symlinks).
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 1e0fb038b19d8..47ff9389e8028 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -1026,4 +1026,29 @@ def CIR_UnwindAttr : CIR_UnitAttr<"Unwind", "unwind"> {
   let storageType = [{ CatchUnwind }];
 }

+//===----------------------------------------------------------------------===//
+// CIR_BlockAddrInfoAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddrInfoAttr : CIR_Attr<"BlockAddrInfo", "block_addr_info"> {
+  let summary = "Block Address attribute";
+  let description = [{
+    This attribute is used to represent the address of a basic block
+    within a function. It combines the symbol reference to a function
+    with the name of a label inside that function.
+  }];
+  let parameters = (ins "mlir::FlatSymbolRefAttr":$func,
+                        "mlir::StringAttr":$label);
+
+  let assemblyFormat = "`<` $func `,` $label `>`";
+  let builders = [
+    AttrBuilder<(ins "llvm::StringRef":$func_name,
+                     "llvm::StringRef":$label_name
+    ), [{
+      return $_get($_ctxt, mlir::FlatSymbolRefAttr::get($_ctxt, func_name),
+                   mlir::StringAttr::get($_ctxt, label_name));
+    }]>
+  ];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD
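As a quick orientation, a hedged sketch of how the generated builder above is meant to be called; the header path and names here are assumptions (the real call site is the VisitAddrLabelExpr change in CIRGenExprScalar.cpp further down):

```c++
#include "mlir/IR/MLIRContext.h"
// Assumed include; the generated attribute class is normally pulled in via
// the CIR dialect headers.
#include "clang/CIR/Dialect/IR/CIRAttrs.h"

// The AttrBuilder wraps the two strings into a FlatSymbolRefAttr plus a
// StringAttr, so a caller can simply write:
cir::BlockAddrInfoAttr makeInfo(mlir::MLIRContext *ctx) {
  // Per the assemblyFormat above, this prints as <@foo, "label1">.
  return cir::BlockAddrInfoAttr::get(ctx, /*func_name=*/"foo",
                                     /*label_name=*/"label1");
}
```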
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 2124b1dc62a81..e612d6a0ba886 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3386,6 +3386,10 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> {
     cannot be known by the operation, and that information affects how the
     operation is lowered.

+    The validity of the relationship of derived and base cannot yet be verified.
+    If the target class is not a valid base class for the object, the behavior
+    is undefined.
+
     Example:
     ```c++
     struct Base { };
     struct Derived : Base { };
     Derived d;
     Base &b = d;
     ```
   }];

-  // The validity of the relationship of derived and base cannot yet be
-  // verified, currently not worth adding a verifier.
   let arguments = (ins Arg:$derived_addr, IndexAttr:$offset,
                        UnitAttr:$assume_not_null);

@@ -3414,6 +3416,56 @@
   }];
 }

+//===----------------------------------------------------------------------===//
+// DerivedClassAddrOp
+//===----------------------------------------------------------------------===//
+
+def CIR_DerivedClassAddrOp : CIR_Op<"derived_class_addr"> {
+  let summary = "Get the derived class address for a class/struct";
+  let description = [{
+    The `cir.derived_class_addr` operation gets the address of a particular
+    derived class given a non-virtual base class pointer. The offset in bytes
+    of the base class must be passed in, similar to `cir.base_class_addr`, but
+    in the other direction, which means it lowers to a negative offset.
+
+    The operation contains a flag for whether or not the operand may be nullptr.
+    That depends on the context and cannot be known by the operation, and that
+    information affects how the operation is lowered.
+
+    The validity of the relationship of derived and base cannot yet be verified.
+    If the target class is not a valid derived class for the object, the
+    behavior is undefined.
+
+    Example:
+    ```c++
+    class A {};
+    class B : public A {};
+
+    B *getAsB(A *a) {
+      return static_cast<B *>(a);
+    }
+    ```
+
+    leads to
+    ```mlir
+    %2 = cir.load %0 : !cir.ptr<!cir.ptr<!rec_A>>, !cir.ptr<!rec_A>
+    %3 = cir.derived_class_addr %2 : !cir.ptr<!rec_A> [0] -> !cir.ptr<!rec_B>
+    ```
+  }];
+
+  let arguments = (ins
+    Arg:$base_addr,
+    IndexAttr:$offset, UnitAttr:$assume_not_null);
+
+  let results = (outs Res:$derived_addr);
+
+  let assemblyFormat = [{
+    $base_addr `:` qualified(type($base_addr))
+    (`nonnull` $assume_not_null^)?
+    ` ` `[` $offset `]` `->` qualified(type($derived_addr)) attr-dict
+  }];
+}
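To make the "other direction" concrete, a small sketch; the layout numbers are illustrative (4-byte int, no extra padding assumed) and mirror the castBtoX case in the new base-to-derived.cpp test further down:

```c++
struct A { int a; };           // at offset 0 inside X
struct B { int b; };           // at offset 4 inside X under the assumed layout
struct X : A, B { int x; };

X *asX(B *b) {
  // cir.base_class_addr would add +4 when converting X* -> B*;
  // cir.derived_class_addr subtracts the same 4 bytes for B* -> X*,
  // i.e. it lowers to a GEP with a negative byte offset.
  return static_cast<X *>(b);
}
```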
+
 //===----------------------------------------------------------------------===//
 // ComplexCreateOp
 //===----------------------------------------------------------------------===//
@@ -4845,4 +4897,38 @@ def CIR_AtomicClearOp : CIR_Op<"atomic.clear"> {
   }];
 }

+//===----------------------------------------------------------------------===//
+// BlockAddressOp
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddressOp : CIR_Op<"block_address", [Pure]> {
+  let summary = "Get the address of a cir.label within a function";
+  let description = [{
+    The `cir.block_address` operation takes a function name and a label and
+    produces a pointer value that represents the address of that cir.label
+    within the specified function.
+
+    This operation models GCC's "labels as values" extension (`&&label`), which
+    allows taking the address of a local label and using it as a computed
+    jump target (e.g., with `goto *addr;`).
+
+    Example:
+    ```mlir
+    %1 = cir.alloca !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>>, ["ptr", init]
+      {alignment = 8 : i64}
+    %addr = cir.block_address <@c, "label1"> : !cir.ptr<!void>
+    cir.store align(8) %addr, %1 : !cir.ptr<!void>, !cir.ptr<!cir.ptr<!void>>
+    cir.br ^bb1
+    ^bb1:
+    cir.label "label1"
+    ```
+  }];
+
+  let arguments = (ins CIR_BlockAddrInfoAttr:$block_addr_info);
+  let results = (outs CIR_VoidPtrType:$addr);
+  let assemblyFormat = [{
+    $block_addr_info `:` qualified(type($addr)) attr-dict
+  }];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
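For context, a minimal source-level sketch of the GNU extension this op models (accepted by Clang and GCC in both C and C++):

```c++
// "Labels as values": &&label yields a void * pointing at the label, and a
// computed goto jumps through it. cir.block_address produces the pointer;
// cir.label marks the jump target.
int parity(int n) {
  void *targets[] = {&&even, &&odd};
  goto *targets[n & 1]; // computed jump
even:
  return 0;
odd:
  return 1;
}
```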
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index f5a368636c43d..da155d31d4a88 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -781,9 +781,25 @@ struct LibcFunNamePrefixSuffixParser {
   }
 };

+// Constant fold a conditional expression 'cond ? A : B' to
+// - 'A', if 'cond' has constant true value;
+// - 'B', if 'cond' has constant false value.
+static const Expr *tryConstantFoldConditionalExpr(const Expr *E,
+                                                  const ASTContext &Ctx) {
+  // FIXME: more places can use this function
+  if (const auto *CE = dyn_cast<ConditionalOperator>(E)) {
+    bool CondEval;
+
+    if (CE->getCond()->EvaluateAsBooleanCondition(CondEval, Ctx))
+      return CondEval ? CE->getLHS() : CE->getRHS();
+  }
+  return E;
+}
+
 // A pointer type expression is known to be null-terminated, if it has the
 // form: E.c_str(), for any expression E of `std::string` type.
-static bool isNullTermPointer(const Expr *Ptr) {
+static bool isNullTermPointer(const Expr *Ptr, ASTContext &Ctx) {
+  Ptr = tryConstantFoldConditionalExpr(Ptr, Ctx);
   if (isa<StringLiteral>(Ptr->IgnoreParenImpCasts()))
     return true;
   if (isa<PredefinedExpr>(Ptr->IgnoreParenImpCasts()))
     return true;
@@ -874,7 +890,7 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg,

       const Expr *Arg = Call->getArg(ArgIdx);

-      if (isNullTermPointer(Arg))
+      if (isNullTermPointer(Arg, Ctx))
         // If Arg is a null-terminated pointer, it is safe anyway.
         return true; // continue parsing

@@ -922,8 +938,8 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg,
     // (including the format argument) is unsafe pointer.
     return llvm::any_of(
         llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()),
-        [&UnsafeArg](const Expr *Arg) -> bool {
-          if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+        [&UnsafeArg, &Ctx](const Expr *Arg) -> bool {
+          if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
             UnsafeArg = Arg;
             return true;
           }
@@ -1175,7 +1191,7 @@ static bool hasUnsafePrintfStringArg(const CallExpr &Node, ASTContext &Ctx,
   // We don't really recognize this "normal" printf, the only thing we
   // can do is to require all pointers to be null-terminated:
   for (const auto *Arg : Node.arguments())
-    if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+    if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
       Result.addNode(Tag, DynTypedNode::create(*Arg));
       return true;
     }
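The motivating pattern appears in the new warn-unsafe-buffer-usage-fold-conditional.cpp test near the end of this patch; a reduced, hedged sketch of what the fold now lets through warning-free under -Wunsafe-buffer-usage:

```c++
#include <cstdio>

// With DEBUG being a compile-time constant, the condition of the ?: folds,
// so the chosen arm is a string literal and isNullTermPointer() can prove
// the %s argument is null-terminated.
constexpr bool DEBUG = false;

void report(long long value) {
  std::printf("value: %s %lld\n", DEBUG ? "debug" : "", value);
}
```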
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index a391d7e70ace7..5ab1d0e05cf8a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -405,6 +405,19 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return Address(baseAddr, destType, addr.getAlignment());
   }

+  Address createDerivedClassAddr(mlir::Location loc, Address addr,
+                                 mlir::Type destType, unsigned offset,
+                                 bool assumeNotNull) {
+    if (destType == addr.getElementType())
+      return addr;
+
+    cir::PointerType ptrTy = getPointerTo(destType);
+    auto derivedAddr =
+        cir::DerivedClassAddrOp::create(*this, loc, ptrTy, addr.getPointer(),
+                                        mlir::APInt(64, offset), assumeNotNull);
+    return Address(derivedAddr, destType, addr.getAlignment());
+  }
+
   mlir::Value createVTTAddrPoint(mlir::Location loc, mlir::Type retTy,
                                  mlir::Value addr, uint64_t offset) {
     return cir::VTTAddrPointOp::create(*this, loc, retTy,
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index a8296782ebc40..89c4696b9da94 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -1110,6 +1110,25 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase,
   }
 }

+Address CIRGenFunction::getAddressOfDerivedClass(
+    mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived,
+    llvm::iterator_range<CastExpr::path_const_iterator> path,
+    bool nullCheckValue) {
+  assert(!path.empty() && "Base path should not be empty!");
+
+  QualType derivedTy = getContext().getCanonicalTagType(derived);
+  mlir::Type derivedValueTy = convertType(derivedTy);
+  CharUnits nonVirtualOffset =
+      cgm.computeNonVirtualBaseClassOffset(derived, path);
+
+  // Note that in OG, no offset (nonVirtualOffset.getQuantity() == 0) means it
+  // just gives the address back. In CIR a `cir.derived_class_addr` is created
+  // and made into a nop later on during lowering.
+  return builder.createDerivedClassAddr(loc, baseAddr, derivedValueTy,
+                                        nonVirtualOffset.getQuantity(),
+                                        /*assumeNotNull=*/!nullCheckValue);
+}
+
 Address CIRGenFunction::getAddressOfBaseClass(
     Address value, const CXXRecordDecl *derived,
     llvm::iterator_range<CastExpr::path_const_iterator> path,
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index d35bb0af0de14..8607558c1cf7d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1301,7 +1301,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
   case CK_NonAtomicToAtomic:
   case CK_AtomicToNonAtomic:
   case CK_ToUnion:
-  case CK_BaseToDerived:
   case CK_ObjCObjectLValueCast:
   case CK_VectorSplat:
   case CK_ConstructorConversion:
@@ -1336,6 +1335,7 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
                     lv.getAddress().getAlignment()),
                 e->getType(), lv.getBaseInfo());
   }
+
   case CK_LValueBitCast: {
     // This must be a reinterpret_cast (or c-style equivalent).
     const auto *ce = cast<ExplicitCastExpr>(e);
@@ -1387,6 +1387,22 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
     return makeAddrLValue(baseAddr, e->getType(), lv.getBaseInfo());
   }

+  case CK_BaseToDerived: {
+    const auto *derivedClassDecl = e->getType()->castAsCXXRecordDecl();
+    LValue lv = emitLValue(e->getSubExpr());
+
+    // Perform the base-to-derived conversion
+    Address derived = getAddressOfDerivedClass(
+        getLoc(e->getSourceRange()), lv.getAddress(), derivedClassDecl,
+        e->path(), /*NullCheckValue=*/false);
+    // C++11 [expr.static.cast]p2: Behavior is undefined if a downcast is
+    // performed and the object is not of the derived type.
+    assert(!cir::MissingFeatures::sanitizers());
+
+    assert(!cir::MissingFeatures::opTBAA());
+    return makeAddrLValue(derived, e->getType(), lv.getBaseInfo());
+  }
+
   case CK_ZeroToOCLOpaqueType:
     llvm_unreachable("NULL to OpenCL opaque type lvalue cast is not valid");
   }
@@ -1782,11 +1798,7 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) {
   const auto *fd = cast<FunctionDecl>(gd.getDecl());

   if (unsigned builtinID = fd->getBuiltinID()) {
-    if (fd->getAttr<AsmLabelAttr>()) {
-      cgm.errorNYI("AsmLabelAttr");
-    }
-
-    StringRef ident = fd->getName();
+    StringRef ident = cgm.getMangledName(gd);
     std::string fdInlineName = (ident + ".inline").str();

     bool isPredefinedLibFunction =
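The new CK_BaseToDerived lvalue path above also covers reference downcasts; a minimal sketch of the source pattern it handles, mirroring the castBReftoXRef case in the new base-to-derived.cpp test (layout assumptions as before):

```c++
struct A { int a; };
struct B { int b; };
struct X : A, B { int x; };

// A reference downcast goes through the lvalue CK_BaseToDerived path; since
// a reference cannot be null, the emitted cir.derived_class_addr carries the
// `nonnull` flag and lowers to a plain negative-offset GEP with no select.
X &asX(B &b) { return static_cast<X &>(b); }
```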
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index ce95607bd468d..f777562ba6309 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -168,6 +168,15 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return emitLoadOfLValue(e);
   }

+  mlir::Value VisitAddrLabelExpr(const AddrLabelExpr *e) {
+    auto func = cast<cir::FuncOp>(cgf.curFn);
+    auto blockInfoAttr = cir::BlockAddrInfoAttr::get(
+        &cgf.getMLIRContext(), func.getSymName(), e->getLabel()->getName());
+    return cir::BlockAddressOp::create(builder, cgf.getLoc(e->getSourceRange()),
+                                       cgf.convertType(e->getType()),
+                                       blockInfoAttr);
+  }
+
   mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) {
     mlir::Type type = cgf.convertType(e->getType());
     return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()),
@@ -1972,6 +1981,20 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
     return builder.createIntToPtr(middleVal, destCIRTy);
   }
+  case CK_BaseToDerived: {
+    const CXXRecordDecl *derivedClassDecl = destTy->getPointeeCXXRecordDecl();
+    assert(derivedClassDecl && "BaseToDerived arg isn't a C++ object pointer!");
+    Address base = cgf.emitPointerWithAlignment(subExpr);
+    Address derived = cgf.getAddressOfDerivedClass(
+        cgf.getLoc(ce->getSourceRange()), base, derivedClassDecl, ce->path(),
+        cgf.shouldNullCheckClassCastValue(ce));
+
+    // C++11 [expr.static.cast]p11: Behavior is undefined if a downcast is
+    // performed and the object is not of the derived type.
+    assert(!cir::MissingFeatures::sanitizers());
+
+    return cgf.getAsNaturalPointerTo(derived, ce->getType()->getPointeeType());
+  }
   case CK_UncheckedDerivedToBase:
   case CK_DerivedToBase: {
     // The EmitPointerWithAlignment path does this fine; just discard
@@ -1979,7 +2002,6 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
     return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce),
                                      ce->getType()->getPointeeType());
   }
-
   case CK_Dynamic: {
     Address v = cgf.emitPointerWithAlignment(subExpr);
     const auto *dce = cast<CXXDynamicCastExpr>(ce);
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 2dddf26981105..b22bf2d87fc10 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -823,6 +823,11 @@ class CIRGenFunction : public CIRGenTypeCache {
                                llvm::iterator_range<CastExpr::path_const_iterator> path,
                                bool nullCheckValue, SourceLocation loc);

+  Address getAddressOfDerivedClass(
+      mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived,
+      llvm::iterator_range<CastExpr::path_const_iterator> path,
+      bool nullCheckValue);
+
   /// Return the VTT parameter that should be passed to a base
   /// constructor/destructor with virtual bases.
   /// FIXME: VTTs are Itanium ABI-specific, so the definition should move
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 9ac5efe0e41c7..22aada882defc 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1912,22 +1912,45 @@ mlir::LogicalResult cir::FuncOp::verify() {

   llvm::SmallSet<llvm::StringRef, 4> labels;
   llvm::SmallSet<llvm::StringRef, 4> gotos;
-
+  llvm::SmallSet<llvm::StringRef, 4> blockAddresses;
+  bool invalidBlockAddress = false;
   getOperation()->walk([&](mlir::Operation *op) {
     if (auto lab = dyn_cast<cir::LabelOp>(op)) {
       labels.insert(lab.getLabel());
     } else if (auto goTo = dyn_cast<cir::GotoOp>(op)) {
       gotos.insert(goTo.getLabel());
+    } else if (auto blkAdd = dyn_cast<cir::BlockAddressOp>(op)) {
+      if (blkAdd.getBlockAddrInfoAttr().getFunc().getAttr() != getSymName()) {
+        // Stop the walk early, no need to continue
+        invalidBlockAddress = true;
+        return mlir::WalkResult::interrupt();
+      }
+      blockAddresses.insert(blkAdd.getBlockAddrInfoAttr().getLabel());
     }
+    return mlir::WalkResult::advance();
   });

+  if (invalidBlockAddress)
+    return emitOpError() << "blockaddress references a different function";
+
+  llvm::SmallSet<llvm::StringRef, 4> mismatched;
   if (!labels.empty() || !gotos.empty()) {
-    llvm::SmallSet<llvm::StringRef, 4> mismatched =
-        llvm::set_difference(gotos, labels);
+    mismatched = llvm::set_difference(gotos, labels);

     if (!mismatched.empty())
       return emitOpError() << "goto/label mismatch";
   }
+
+  mismatched.clear();
+
+  if (!labels.empty() || !blockAddresses.empty()) {
+    mismatched = llvm::set_difference(blockAddresses, labels);
+
+    if (!mismatched.empty())
+      return emitOpError()
+             << "expects an existing label target in the referenced function";
+  }
+
   return success();
 }
diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
index 00972b6976295..d590ccce1f540 100644
--- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
@@ -8,6 +8,7 @@
 #include "PassDetail.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/Passes.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/TimeProfiler.h"
"llvm/Support/TimeProfiler.h" #include @@ -30,17 +31,29 @@ static void process(cir::FuncOp func) { mlir::OpBuilder rewriter(func.getContext()); llvm::StringMap labels; llvm::SmallVector gotos; + llvm::SmallSet blockAddrLabel; func.getBody().walk([&](mlir::Operation *op) { if (auto lab = dyn_cast(op)) { - // Will construct a string copy inplace. Safely erase the label labels.try_emplace(lab.getLabel(), lab->getBlock()); - lab.erase(); } else if (auto goTo = dyn_cast(op)) { gotos.push_back(goTo); + } else if (auto blockAddr = dyn_cast(op)) { + blockAddrLabel.insert(blockAddr.getBlockAddrInfo().getLabel()); } }); + for (auto &lab : labels) { + StringRef labelName = lab.getKey(); + Block *block = lab.getValue(); + if (!blockAddrLabel.contains(labelName)) { + // erase the LabelOp inside the block if safe + if (auto lab = dyn_cast(&block->front())) { + lab.erase(); + } + } + } + for (auto goTo : gotos) { mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(goTo); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d88a4ad76f27b..d43a462a25092 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1360,6 +1360,41 @@ mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMDerivedClassAddrOpLowering::matchAndRewrite( + cir::DerivedClassAddrOp derivedClassOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + const mlir::Type resultType = + getTypeConverter()->convertType(derivedClassOp.getType()); + mlir::Value baseAddr = adaptor.getBaseAddr(); + // The offset is set in the operation as an unsigned value, but it must be + // applied as a negative offset. 
+  int64_t offsetVal = -(adaptor.getOffset().getZExtValue());
+  if (offsetVal == 0) {
+    // If the offset is zero, we can just return the base address.
+    rewriter.replaceOp(derivedClassOp, baseAddr);
+    return mlir::success();
+  }
+  llvm::SmallVector<mlir::LLVM::GEPArg> offset = {offsetVal};
+  mlir::Type byteType = mlir::IntegerType::get(resultType.getContext(), 8,
+                                               mlir::IntegerType::Signless);
+  if (derivedClassOp.getAssumeNotNull()) {
+    rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(
+        derivedClassOp, resultType, byteType, baseAddr, offset,
+        mlir::LLVM::GEPNoWrapFlags::inbounds);
+  } else {
+    mlir::Location loc = derivedClassOp.getLoc();
+    mlir::Value isNull = mlir::LLVM::ICmpOp::create(
+        rewriter, loc, mlir::LLVM::ICmpPredicate::eq, baseAddr,
+        mlir::LLVM::ZeroOp::create(rewriter, loc, baseAddr.getType()));
+    mlir::Value adjusted =
+        mlir::LLVM::GEPOp::create(rewriter, loc, resultType, byteType, baseAddr,
+                                  offset, mlir::LLVM::GEPNoWrapFlags::inbounds);
+    rewriter.replaceOpWithNewOp<mlir::LLVM::SelectOp>(derivedClassOp, isNull,
+                                                      baseAddr, adjusted);
+  }
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
     cir::ATanOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -3802,6 +3837,12 @@ mlir::LogicalResult CIRToLLVMVAArgOpLowering::matchAndRewrite(
   return mlir::success();
 }

+mlir::LogicalResult CIRToLLVMBlockAddressOpLowering::matchAndRewrite(
+    cir::BlockAddressOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  // Lowering of cir.block_address is not implemented yet.
+  return mlir::failure();
+}
+
 std::unique_ptr<mlir::Pass> createConvertCIRToLLVMPass() {
   return std::make_unique<ConvertCIRToLLVMPass>();
 }
diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h
index fa8d918248dd0..fad9c6ca7ffc5 100644
--- a/clang/lib/Headers/__clang_hip_libdevice_declares.h
+++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -338,6 +338,23 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);

+__device__ void __asan_poison_memory_region(const void *addr,
+                                            __SIZE_TYPE__ size);
+__device__ void __asan_unpoison_memory_region(const void *addr,
+                                              __SIZE_TYPE__ size);
+__device__ int __asan_address_is_poisoned(const void *addr);
+__device__ void *__asan_region_is_poisoned(void *beg, __SIZE_TYPE__ size);
+
+#if __has_feature(address_sanitizer)
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
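A hedged usage sketch for the newly declared device-side ASan helpers; the function and names are illustrative, not part of this patch, and the declarations are assumed to be in scope via the HIP headers. With ASan disabled the macros expand to harmless no-ops:

```c++
// Illustrative HIP device code: quarantine the unused tail of a device-side
// pool so stray accesses are caught when ASan instrumentation is enabled.
__device__ void carve_pool_tail(char *pool, __SIZE_TYPE__ used,
                                __SIZE_TYPE__ capacity) {
  // Mark the unused tail as off-limits...
  ASAN_POISON_MEMORY_REGION(pool + used, capacity - used);
  // ...and re-allow it once the region is handed out again.
  ASAN_UNPOISON_MEMORY_REGION(pool + used, capacity - used);
}
```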
diff --git a/clang/test/CIR/CodeGen/asm-label-inline-builtins.c b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c
new file mode 100644
index 0000000000000..24c9a32e7c41d
--- /dev/null
+++ b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-cir -disable-llvm-passes -o %t-cir.cir %s
+// RUN: FileCheck --input-file=%t-cir.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-llvm -disable-llvm-passes -o %t-cir.ll %s
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o %t.ll %s
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+
+// Verifies that clang-generated *.inline carry the same name at call and callee
+// site, in spite of asm labels.
+
+typedef struct _IO_FILE FILE;
+extern FILE *stdout;
+extern int vprintf (const char *__restrict __format, __builtin_va_list __arg);
+extern int __vfprintf_chk (FILE *__restrict __stream, int __flag,
+                           const char *__restrict __format, __builtin_va_list __ap);
+extern int __vprintf_chk (int __flag, const char *__restrict __format,
+                          __builtin_va_list __ap);
+
+extern __typeof (vprintf) vprintf __asm ("__vprintfieee128");
+extern __typeof (__vfprintf_chk) __vfprintf_chk __asm ("__vfprintf_chkieee128");
+extern __typeof (__vprintf_chk) __vprintf_chk __asm ("__vprintf_chkieee128");
+
+extern __inline __attribute__ ((__always_inline__)) __attribute__ ((__gnu_inline__)) __attribute__ ((__artificial__)) int
+vprintf (const char *__restrict __fmt, __builtin_va_list __ap)
+{
+  return __vfprintf_chk (stdout, 2 - 1, __fmt, __ap);
+}
+
+void test(const char *fmt, __builtin_va_list ap) {
+  vprintf(fmt, ap);
+}
+
+// CIR: cir.func internal private @__vprintfieee128.inline({{.*}}) -> !s32i inline(always)
+// CIR: cir.call @__vfprintf_chkieee128(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}})
+//
+// CIR: cir.func {{.*}} @test({{.*}})
+// CIR: cir.call @__vprintfieee128.inline(%{{.*}}, %{{.*}})
+
+
+// LLVM: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] {
+// LLVM: call i32 @__vfprintf_chkieee128(ptr %{{.*}}, i32 1, ptr %{{.*}}, ptr %{{.*}})
+//
+// LLVM: define {{.*}} void @test{{.*}}
+// LLVM: call i32 @__vprintfieee128.inline(ptr %{{.*}}, ptr %{{.*}})
+//
+// LLVM: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline }
+
+// Note: OGCG emits these in the opposite order, but the content is the same.
+
+
+// OGCG: define {{.*}} void @test{{.*}}
+// OGCG: call i32 @__vprintfieee128.inline(ptr noundef %{{.*}}, ptr noundef %{{.*}})
+//
+// OGCG: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] {
+// OGCG: call i32 @__vfprintf_chkieee128(ptr noundef %{{.*}}, i32 noundef 1, ptr noundef %{{.*}}, ptr noundef %{{.*}})
+//
+// OGCG: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline {{.*}} }
diff --git a/clang/test/CIR/CodeGen/base-to-derived.cpp b/clang/test/CIR/CodeGen/base-to-derived.cpp
new file mode 100644
index 0000000000000..af9aa0ffd19c1
--- /dev/null
+++ b/clang/test/CIR/CodeGen/base-to-derived.cpp
@@ -0,0 +1,97 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+class A {
+  int a;
+};
+
+class B {
+  int b;
+public:
+  A *getAsA();
+};
+
+class X : public A, public B {
+  int x;
+};
+
+X *castAtoX(A *a) {
+  return static_cast<X *>(a);
+}
+
+// CIR: cir.func {{.*}} @_Z8castAtoXP1A(%[[ARG0:.*]]: !cir.ptr {{.*}})
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init]
+// CIR: cir.store %[[ARG0]], %[[A_ADDR]] : !cir.ptr, !cir.ptr>
+// CIR: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.ptr
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[A]] : !cir.ptr [0] -> !cir.ptr
+
+// Note: Because the offset is 0, a null check is not needed.
+
+// LLVM: define {{.*}} ptr @_Z8castAtoXP1A(ptr %[[ARG0:.*]])
+// LLVM: %[[A_ADDR:.*]] = alloca ptr
+// LLVM: store ptr %[[ARG0]], ptr %[[A_ADDR]]
+// LLVM: %[[X:.*]] = load ptr, ptr %[[A_ADDR]]
+
+// OGCG: define {{.*}} ptr @_Z8castAtoXP1A(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: %[[A_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[A_ADDR]]
+// OGCG: %[[X:.*]] = load ptr, ptr %[[A_ADDR]]
+
+X *castBtoX(B *b) {
+  return static_cast<X *>(b);
+}
+
+// CIR: cir.func {{.*}} @_Z8castBtoXP1B(%[[ARG0:.*]]: !cir.ptr {{.*}})
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init]
+// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr, !cir.ptr>
+// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.ptr
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr [4] -> !cir.ptr
+
+// LLVM: define {{.*}} ptr @_Z8castBtoXP1B(ptr %[[ARG0:.*]])
+// LLVM: %[[B_ADDR:.*]] = alloca ptr, i64 1, align 8
+// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]], align 8
+// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]], align 8
+// LLVM: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null
+// LLVM: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4
+// LLVM: %[[X:.*]] = select i1 %[[IS_NULL]], ptr %[[B]], ptr %[[B_NON_NULL]]
+
+// OGCG: define {{.*}} ptr @_Z8castBtoXP1B(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: entry:
+// OGCG: %[[B_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// OGCG: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null
+// OGCG: br i1 %[[IS_NULL]], label %[[LABEL_NULL:.*]], label %[[LABEL_NOTNULL:.*]]
+// OGCG: [[LABEL_NOTNULL]]:
+// OGCG: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4
+// OGCG: br label %[[LABEL_END:.*]]
+// OGCG: [[LABEL_NULL]]:
+// OGCG: br label %[[LABEL_END:.*]]
+// OGCG: [[LABEL_END]]:
+// OGCG: %[[X:.*]] = phi ptr [ %[[B_NON_NULL]], %[[LABEL_NOTNULL]] ], [ null, %[[LABEL_NULL]] ]
+
+X &castBReftoXRef(B &b) {
+  return static_cast<X &>(b);
+}
+
+// CIR: cir.func {{.*}} @_Z14castBReftoXRefR1B(%[[ARG0:.*]]: !cir.ptr {{.*}})
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init, const]
+// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr, !cir.ptr>
+// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.ptr
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr nonnull [4] -> !cir.ptr
+
+// LLVM: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr %[[ARG0:.*]])
+// LLVM: %[[B_ADDR:.*]] = alloca ptr
+// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// LLVM: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4
+
+// OGCG: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: %[[B_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// OGCG: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4
diff --git a/clang/test/CIR/CodeGen/label-values.c b/clang/test/CIR/CodeGen/label-values.c
new file mode 100644
index 0000000000000..41178e3f62f20
--- /dev/null
+++ b/clang/test/CIR/CodeGen/label-values.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+
+void A(void) {
+  void *ptr = &&LABEL_A;
+LABEL_A:
+  return;
+}
+// CIR: cir.func dso_local @A
+// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64}
+// CIR: [[BLOCK:%.*]] = cir.block_address <@A, "LABEL_A"> : !cir.ptr
+// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr>
+// CIR: cir.br ^bb1
+// CIR: ^bb1: // pred: ^bb0
+// CIR: cir.label "LABEL_A"
+// CIR: cir.return
+
+void B(void) {
+LABEL_B:
+  void *ptr = &&LABEL_B;
+}
+
+// CIR: cir.func dso_local @B()
+// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64}
+// CIR: cir.br ^bb1
+// CIR: ^bb1:
+// CIR: cir.label "LABEL_B"
+// CIR: [[BLOCK:%.*]] = cir.block_address <@B, "LABEL_B"> : !cir.ptr
+// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr>
+// CIR: cir.return
+
+void C(int x) {
+  void *ptr = (x == 0) ? &&LABEL_A : &&LABEL_B;
+LABEL_A:
+  return;
+LABEL_B:
+  return;
+}
+
+// CIR: cir.func dso_local @C
+// CIR: [[BLOCK1:%.*]] = cir.block_address <@C, "LABEL_A"> : !cir.ptr
+// CIR: [[BLOCK2:%.*]] = cir.block_address <@C, "LABEL_B"> : !cir.ptr
+// CIR: [[COND:%.*]] = cir.select if [[CMP:%.*]] then [[BLOCK1]] else [[BLOCK2]] : (!cir.bool, !cir.ptr, !cir.ptr) -> !cir.ptr
+// CIR: cir.store align(8) [[COND]], [[PTR:%.*]] : !cir.ptr, !cir.ptr>
+// CIR: cir.br ^bb1
+// CIR: ^bb1: // pred: ^bb0
+// CIR: cir.label "LABEL_A"
+// CIR: cir.br ^bb2
+// CIR: ^bb2: // 2 preds: ^bb1, ^bb3
+// CIR: cir.return
+// CIR: ^bb3: // no predecessors
+// CIR: cir.label "LABEL_B"
+// CIR: cir.br ^bb2
+
+void D(void) {
+  void *ptr = &&LABEL_A;
+  void *ptr2 = &&LABEL_A;
+LABEL_A:
+  void *ptr3 = &&LABEL_A;
+  return;
+}
+
+// CIR: cir.func dso_local @D
+// CIR: %[[PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init]
+// CIR: %[[PTR2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr2", init]
+// CIR: %[[PTR3:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr3", init]
+// CIR: %[[BLK1:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr
+// CIR: cir.store align(8) %[[BLK1]], %[[PTR]] : !cir.ptr, !cir.ptr>
+// CIR: %[[BLK2:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr
+// CIR: cir.store align(8) %[[BLK2]], %[[PTR2]] : !cir.ptr, !cir.ptr>
+// CIR: cir.br ^bb1
+// CIR: ^bb1: // pred: ^bb0
+// CIR: cir.label "LABEL_A"
+// CIR: %[[BLK3:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr
+// CIR: cir.store align(8) %[[BLK3]], %[[PTR3]] : !cir.ptr, !cir.ptr>
+// CIR: cir.return
diff --git a/clang/test/CIR/IR/block-address.cir b/clang/test/CIR/IR/block-address.cir
new file mode 100644
index 0000000000000..9d6840819c2d4
--- /dev/null
+++ b/clang/test/CIR/IR/block-address.cir
@@ -0,0 +1,34 @@
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
+
+!void = !cir.void
+
+module {
+  cir.func @block_address() {
+    %0 = cir.block_address <@block_address, "label"> : !cir.ptr
+    cir.br ^bb1
+  ^bb1:
+    cir.label "label"
+    cir.return
+  }
+// CHECK: cir.func @block_address
+// CHECK: %0 = cir.block_address <@block_address, "label"> : !cir.ptr
+// CHECK: cir.br ^bb1
+// CHECK: ^bb1:
+// CHECK: cir.label "label"
+// CHECK: cir.return
+
+cir.func @block_address_inside_scope() -> () {
+  cir.scope {
+    %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr
+  }
+  cir.br ^bb1
+^bb1:
+  cir.label "label"
+  cir.return
+}
+// CHECK: cir.func @block_address_inside_scope
+// CHECK: cir.scope
+// CHECK: %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr
+// CHECK: cir.label "label"
+// CHECK: cir.return
+}
diff --git a/clang/test/CIR/IR/invalid-block-address.cir b/clang/test/CIR/IR/invalid-block-address.cir
new file mode 100644
index 0000000000000..4519485c28803
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-block-address.cir
@@ -0,0 +1,21 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+!void = !cir.void
+
+// 
expected-error@+1 {{expects an existing label target in the referenced function}} +cir.func @bad_block_address() -> () { + %0 = cir.block_address <@bad_block_address, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "wrong_label" + cir.return +} + +// expected-error@+1 {{blockaddress references a different function}} +cir.func @bad_block_func() -> () { + %0 = cir.block_address <@mismatch_func, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "label" + cir.return +} diff --git a/clang/test/CIR/Transforms/goto_solver.cir b/clang/test/CIR/Transforms/goto_solver.cir new file mode 100644 index 0000000000000..6ae019b44a39e --- /dev/null +++ b/clang/test/CIR/Transforms/goto_solver.cir @@ -0,0 +1,62 @@ +// RUN: cir-opt %s -cir-goto-solver --verify-roundtrip -o - | FileCheck %s + +!void = !cir.void + +cir.func @a(){ + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@a, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.br ^bb1 +^bb1: + cir.label "label1" + cir.br ^bb2 +^bb2: + // This label is not referenced by any blockaddressOp, so it should be removed + cir.label "label2" + cir.return +} + +// CHECK: cir.func @a() +// CHECK: %1 = cir.block_address <@a, "label1"> : !cir.ptr +// CHECK: ^bb1: +// CHECK: cir.label "label1" +// CHECK: cir.br ^bb2 +// CHECK: ^bb2: +// CHECK-NOT: cir.label "label2" + +cir.func @b(){ + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@b, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.goto "label2" +^bb1: + cir.label "label1" + cir.br ^bb2 +^bb2: + // This label is not referenced by any blockaddressOp, so it should be removed + cir.label "label2" + cir.return +} + +// CHECK: cir.func @b() { +// CHECK: %1 = cir.block_address <@b, "label1"> : !cir.ptr +// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr> +// CHECK: cir.br ^bb2 +// CHECK: ^bb1: +// CHECK: cir.label "label1" +// CHECK: cir.br ^bb2 +// CHECK: ^bb2: +// CHECK-NOT: cir.label "label2" + +cir.func @c() { + cir.label "label1" + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} + %1 = cir.block_address <@c, "label1"> : !cir.ptr + cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr> + cir.return +} + +// CHECK: cir.func @c +// CHECK: cir.label "label1" +// CHECK: %1 = cir.block_address <@c, "label1"> : !cir.ptr +// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr> diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp new file mode 100644 index 0000000000000..b4f30b533bc4b --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -std=c++20 +// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -x c +// expected-no-diagnostics + +typedef struct {} FILE; +int fprintf( FILE* stream, const char* format, ... ); +FILE * stderr; + +#define DEBUG_ASSERT_MESSAGE(name, assertion, label, message, file, line, value) \ + fprintf(stderr, "AssertMacros: %s, %s file: %s, line: %d, value: %lld\n", \ + assertion, (message!=0) ? 
message : "", file, line, (long long) (value)); + + +#define Require(assertion, exceptionLabel) \ + do \ + { \ + if ( __builtin_expect(!(assertion), 0) ) { \ + DEBUG_ASSERT_MESSAGE( \ + "DEBUG_ASSERT_COMPONENT_NAME_STRING", \ + #assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0); \ + goto exceptionLabel; \ + } \ + } while ( 0 ) + + +void f(int x, int y) { + Require(x == y, L1); + L1: + return; +} + diff --git a/clang/tools/cir-opt/cir-opt.cpp b/clang/tools/cir-opt/cir-opt.cpp index c4d29a2117c75..ee42015bb38e9 100644 --- a/clang/tools/cir-opt/cir-opt.cpp +++ b/clang/tools/cir-opt/cir-opt.cpp @@ -58,6 +58,10 @@ int main(int argc, char **argv) { return mlir::createHoistAllocasPass(); }); + ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> { + return mlir::createGotoSolverPass(); + }); + mlir::registerTransformsPasses(); return mlir::asMainReturnCode(MlirOptMain( diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 747b1a2233d32..c2401c86671d0 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1394,7 +1394,7 @@ uptr SizeClassAllocator64::releaseToOSMaybe(RegionInfo *Region, Region->FreeListInfo.PushedBlocks) * BlockSize; if (UNLIKELY(BytesInFreeList == 0)) - return false; + return 0; // ==================================================================== // // 1. Check if we have enough free blocks and if it's worth doing a page diff --git a/compiler-rt/test/fuzzer/merge-posix.test b/compiler-rt/test/fuzzer/merge-posix.test index 2721668fb9706..5e342142216f8 100644 --- a/compiler-rt/test/fuzzer/merge-posix.test +++ b/compiler-rt/test/fuzzer/merge-posix.test @@ -14,7 +14,7 @@ RUN: echo ....U. > %tmp/T2/2 RUN: echo ...Z.. > %tmp/T2/3 RUN: echo ...Z.. > %tmp/T2/4 RUN: echo ....E. 
> %tmp/T2/5 -RUN: echo .....R > %tmp/T2/6 +RUN: %python -c "print('.....R' + 'X' * 1024, end='')" > %tmp/T2/6 # Check that we can report an error if file size exceeded RUN: (ulimit -f 1; not %run %t-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) diff --git a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c index 46d25a4e386dc..1e9bd11d3f49c 100644 --- a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c +++ b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c @@ -7,7 +7,9 @@ // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal // RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw -// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions +// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions +// RUN: diff %t.normal.functions %t.functions // RUN: %clang_pgogen -o %t.cov -g -mllvm --profile-correlate=debug-info -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp // RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov @@ -17,7 +19,9 @@ // RUN: env LLVM_PROFILE_FILE=%t.cov.profraw %run %t.cov.normal // RUN: llvm-profdata merge -o %t.cov.normal.profdata %t.cov.profraw -// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions +// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions +// RUN: diff %t.cov.normal.functions %t.cov.functions // Test debug info correlate with online merging. 
@@ -30,11 +34,15 @@ // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.proflite %run %t // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t.dSYM %t.profdir/ -// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions +// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions +// RUN: diff %t.normal.functions %t.functions // RUN: rm -rf %t.profdir && mkdir %t.profdir // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov // RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov.dSYM %t.profdir/ -// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata) +// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions +// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions +// RUN: diff %t.cov.normal.functions %t.cov.functions diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index ca66aab3140ee..915c8b4a5c6ce 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1244,7 +1244,8 @@ void ClauseProcessor::processMapObjects( std::string mapperIdName = typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *sym = converter.getCurrentScope().FindSymbol(mapperIdName)) { - mapperIdName = converter.mangleName(mapperIdName, sym->owner()); + mapperIdName = + converter.mangleName(mapperIdName, sym->GetUltimate().owner()); } else { mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index ff72d09edeaf3..5af673001f07c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2623,8 +2623,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *mapperSym = converter.getCurrentScope().FindSymbol(mapperIdName)) - mapperIdName = - converter.mangleName(mapperIdName, mapperSym->owner()); + mapperIdName = converter.mangleName( + mapperIdName, mapperSym->GetUltimate().owner()); else mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 4b3c0903b95ec..e9ecec5aae693 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2038,8 +2038,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { if (beginName.v == llvm::omp::OMPD_master_taskloop || beginName.v == llvm::omp::OMPD_master_taskloop_simd || beginName.v == llvm::omp::OMPD_parallel_master_taskloop || - beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd || - beginName.v == llvm::omp::Directive::OMPD_target_loop) { + beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd) { unsigned version{context_.langOptions().OpenMPVersion}; IssueNonConformanceWarning(beginName.v, beginName.source, version); } @@ -3623,8 +3622,8 @@ void OmpAttributeVisitor::IssueNonConformanceWarning(llvm::omp::Directive D, case llvm::omp::OMPD_allocate: setAlternativeStr("ALLOCATORS"); break; - 
case llvm::omp::OMPD_target_loop: - default:; + default: + break; } context_.Warn(common::UsageWarning::OpenMPUsage, source, "%s"_warn_en_US, warnStrOS.str()); diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index 9122661a2869a..70aaa6567597f 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -9,6 +9,8 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-7.mod.f90 -o - >/dev/null ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-7.use.f90 -o - | FileCheck %t/omp-declare-mapper-7.use.f90 +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-8.mod.f90 -o - >/dev/null +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-8.use.f90 -o - | FileCheck %t/omp-declare-mapper-8.use.f90 !--- omp-declare-mapper-1.f90 subroutine declare_mapper_1 @@ -26,7 +28,7 @@ subroutine declare_mapper_1 end type type(my_type2) :: t real :: x, y(nvals) - !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type\.omp\.default\.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { + !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_1Evar"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"values"} {fortran_attrs = #fir.var_attrs} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>> @@ -153,7 +155,7 @@ subroutine declare_mapper_4 integer :: num end type - !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type.omp.default.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] + !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] !$omp declare mapper (my_type :: var) map (var%num) type(my_type) :: a @@ -185,9 +187,9 @@ program declare_mapper_5 end type !CHECK: omp.declare_mapper @[[INNER_MAPPER_NAMED:_QQFFuse_innermy_mapper]] : [[MY_TYPE:!fir\.type<_QFTmytype\{x:i32,y:i32\}>]] - !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype_omp_default_mapper]] : [[MY_TYPE]] !CHECK: omp.declare_mapper @[[OUTER_MAPPER_NAMED:_QQFmy_mapper]] : [[MY_TYPE]] - !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype_omp_default_mapper]] : [[MY_TYPE]] !$omp declare mapper(mytype :: var) map(tofrom: var%x) !$omp declare mapper(my_mapper : mytype :: var) map(tofrom: var%y) @@ -325,3 +327,36 @@ program use_module_mapper a%x = 42 !$omp end target end program use_module_mapper + +!--- omp-declare-mapper-8.mod.f90 +! Module with a default DECLARE MAPPER to be compiled separately. 
+module default_mapper_mod + implicit none + type :: dtype + integer :: x + end type dtype + !$omp declare mapper(dtype :: v) map(tofrom: v%x) +end module default_mapper_mod + +!--- omp-declare-mapper-8.use.f90 +! Consumer program that USEs the module and relies on the default mapper. +! CHECK: omp.declare_mapper @{{.*dtype_omp_default_mapper}} : !fir.type<_QMdefault_mapper_modTdtype{x:i32}> +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(implicit, tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +program use_module_default_mapper + use default_mapper_mod + implicit none + type(dtype) :: a + !$omp target map(a) + a%x = 7 + !$omp end target + + !$omp target map(mapper(default) : a) + a%x = 8 + !$omp end target + + !$omp target + a%x = 8 + !$omp end target +end program use_module_default_mapper diff --git a/flang/test/Lower/OpenMP/map-mapper.f90 b/flang/test/Lower/OpenMP/map-mapper.f90 index 91564bfc7bc46..8934fbb5d6edf 100644 --- a/flang/test/Lower/OpenMP/map-mapper.f90 +++ b/flang/test/Lower/OpenMP/map-mapper.f90 @@ -8,7 +8,7 @@ program p !$omp declare mapper(xx : t1 :: nn) map(to: nn, nn%x) !$omp declare mapper(t1 :: nn) map(from: nn) - !CHECK-LABEL: omp.declare_mapper @_QQFt1.omp.default.mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> + !CHECK-LABEL: omp.declare_mapper @_QQFt1_omp_default_mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> !CHECK-LABEL: omp.declare_mapper @_QQFxx : !fir.type<_QFTt1{x:!fir.array<256xi32>}> type(t1) :: a, b @@ -20,7 +20,7 @@ program p end do !$omp end target - !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1.omp.default.mapper) -> {{.*}} {name = "b"} + !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1_omp_default_mapper) -> {{.*}} {name = "b"} !CHECK: omp.target map_entries(%[[MAP_B]] -> %{{.*}}, %{{.*}} -> %{{.*}} : {{.*}}, {{.*}}) { !$omp target map(mapper(default) : b) do i = 1, n diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 4fa0c1a21c731..8b8a4f50279bb 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -530,7 +530,7 @@ subroutine omp_target_device_ptr use iso_c_binding, only : c_ptr, c_loc type(c_ptr) :: a integer, target :: b - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr\.omp\.default\.mapper]]) -> {{.*}} {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr_omp_default_mapper]]) -> {{.*}} {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}} -> %[[VAL_1:.*]] : !fir.ref>) !$omp target data map(tofrom: a) use_device_ptr(a) !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], __address : (!fir.ref>) -> !fir.ref diff --git a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 index b53bf5ce10557..9da6674c3a58d 100644 --- a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 @@ -29,7 +29,7 @@ program main !PARSE-TREE: 
OpenMPDeclareMapperConstruct !PARSE-TREE: OmpMapperSpecifier -!PARSE-TREE: string = 'ty.omp.default.mapper' +!PARSE-TREE: string = 'ty_omp_default_mapper' !PARSE-TREE: TypeSpec -> DerivedTypeSpec !PARSE-TREE: Name = 'ty' !PARSE-TREE: Name = 'mapped' diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 index 50a38c6494aa6..7a627913f9555 100644 --- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 +++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 @@ -57,7 +57,7 @@ subroutine f01 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareMapperConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare mapper !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpMapperSpecifier -!PARSE-TREE: | | string = 't.omp.default.mapper' +!PARSE-TREE: | | string = 't_omp_default_mapper' !PARSE-TREE: | | TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 't' !PARSE-TREE: | | Name = 'v' diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 5d77540aa6453..9a1b86758357f 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -13,7 +13,7 @@ program main !! Note, symbols come out in their respective scope, but not in declaration order. !CHECK: mymapper: MapperDetails !CHECK: ty: DerivedType components: x -!CHECK: ty.omp.default.mapper: MapperDetails +!CHECK: ty_omp_default_mapper: MapperDetails !CHECK: DerivedType scope: ty !CHECK: OtherConstruct scope: !CHECK: mapped (OmpMapToFrom) {{.*}} ObjectEntity type: TYPE(ty) diff --git a/flang/test/Semantics/OpenMP/target-loop-still-there.f90 b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 new file mode 100644 index 0000000000000..2d3b1820e23d4 --- /dev/null +++ b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 @@ -0,0 +1,10 @@ +!RUN: %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 -Werror %s | FileCheck --allow-empty %s + +!CHECK-NOT: deprecated +subroutine f00 + implicit none + integer :: i + !$omp target loop + do i = 1, 10 + end do +end diff --git a/libc/cmake/caches/armv6m-none-eabi.cmake b/libc/cmake/caches/armv6m-none-eabi.cmake new file mode 100644 index 0000000000000..1f463ae5c0ead --- /dev/null +++ b/libc/cmake/caches/armv6m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv6m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv6m -mcpu=cortex-m0plus -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7em-none-eabi.cmake b/libc/cmake/caches/armv7em-none-eabi.cmake new file mode 100644 index 0000000000000..afbe9c87dffe1 --- /dev/null +++ b/libc/cmake/caches/armv7em-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7em-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7em -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, 
format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7m-none-eabi.cmake b/libc/cmake/caches/armv7m-none-eabi.cmake new file mode 100644 index 0000000000000..796adb2f31148 --- /dev/null +++ b/libc/cmake/caches/armv7m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7m -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8.1m.main-none-eabi.cmake b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4095facce46ac --- /dev/null +++ b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8.1m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp -mcpu=cortex-m55" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8m.main-none-eabi.cmake b/libc/cmake/caches/armv8m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4b69f6a822e71 --- /dev/null +++ b/libc/cmake/caches/armv8m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/baremetal_common.cmake b/libc/cmake/caches/baremetal_common.cmake new file mode 100644 index 0000000000000..c0d665d790393 --- /dev/null +++ b/libc/cmake/caches/baremetal_common.cmake @@ -0,0 +1,21 @@ +# Expects target triple to be passed as `RUNTIMES_TARGET_TRIPLE` + +set(CMAKE_SYSTEM_NAME Generic CACHE STRING "") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") +set(LLVM_ENABLE_RUNTIMES "libc" CACHE STRING "") +set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "") +set(CMAKE_C_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_SYSROOT "" CACHE STRING "") +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_ASM_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LLVM_DEFAULT_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LIBC_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") + +set(LLVM_LIBC_FULL_BUILD "ON" CACHE BOOL "") diff --git a/libc/cmake/caches/riscv32-unknown-elf.cmake b/libc/cmake/caches/riscv32-unknown-elf.cmake new file mode 100644 index 0000000000000..960fb2bb51a4f --- /dev/null +++ b/libc/cmake/caches/riscv32-unknown-elf.cmake @@ -0,0 +1,4 @@ 
+set(CMAKE_SYSTEM_PROCESSOR RISCV CACHE STRING "")
+set(RUNTIMES_TARGET_TRIPLE "riscv32-unknown-elf" CACHE STRING "")
+
+include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake)
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index d3bcad470b3e1..5036c9438a503 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -398,9 +398,11 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.wmemchr
     libc.src.wchar.wcpcpy
     libc.src.wchar.wcpncpy
+    libc.src.wchar.wcstod
     libc.src.wchar.wcstof
     libc.src.wchar.wcstok
     libc.src.wchar.wcstol
+    libc.src.wchar.wcstold
     libc.src.wchar.wcstoll
     libc.src.wchar.wcstoul
     libc.src.wchar.wcstoull
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index faceb9bb4e12d..a524c7f56bed0 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -367,3 +367,17 @@ functions:
     arguments:
       - type: const wchar_t *__restrict
       - type: wchar_t **__restrict
+  - name: wcstod
+    standards:
+      - stdc
+    return_type: double
+    arguments:
+      - type: const wchar_t *__restrict
+      - type: wchar_t **__restrict
+  - name: wcstold
+    standards:
+      - stdc
+    return_type: long double
+    arguments:
+      - type: const wchar_t *__restrict
+      - type: wchar_t **__restrict
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index e3fac9fb80529..e6d9af9eacf73 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -110,6 +110,28 @@ add_entrypoint_object(
     libc.src.errno.errno
 )
 
+add_entrypoint_object(
+  wcstod
+  SRCS
+    wcstod.cpp
+  HDRS
+    wcstod.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+)
+
+add_entrypoint_object(
+  wcstold
+  SRCS
+    wcstold.cpp
+  HDRS
+    wcstold.h
+  DEPENDS
+    libc.src.__support.str_to_float
+    libc.src.errno.errno
+)
+
 add_entrypoint_object(
   wcstok
   SRCS
diff --git a/libc/src/wchar/wcstod.cpp b/libc/src/wchar/wcstod.cpp
new file mode 100644
index 0000000000000..95351c304c0ff
--- /dev/null
+++ b/libc/src/wchar/wcstod.cpp
@@ -0,0 +1,30 @@
+//===-- Implementation of wcstod ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstod.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/str_to_float.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(double, wcstod,
+                   (const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end)) {
+  auto result = internal::strtofloatingpoint<double>(str);
+  if (result.has_error())
+    libc_errno = result.error;
+
+  if (str_end != nullptr)
+    *str_end = const_cast<wchar_t *>(str + result.parsed_len);
+
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
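The new `wcstod` returns the parsed value, advances `*str_end` past the consumed prefix, and touches `errno` only on error. Below is a minimal standalone sketch of that caller-facing contract, not part of the patch; it exercises the standard `<cwchar>` entry point rather than the internal namespace, and the input literal is purely illustrative.

```cpp
#include <cassert>
#include <cerrno>
#include <cwchar>

int main() {
  const wchar_t *input = L"3.14xyz";
  wchar_t *end = nullptr;
  errno = 0;
  double value = std::wcstod(input, &end);
  assert(value > 3.13 && value < 3.15); // the numeric prefix "3.14" is parsed
  assert(end == input + 4);             // end points at the first unconsumed char, 'x'
  assert(errno == 0);                   // no overflow/underflow, so errno is untouched
  return 0;
}
```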
diff --git a/libc/src/wchar/wcstod.h b/libc/src/wchar/wcstod.h
new file mode 100644
index 0000000000000..ff397b93d405d
--- /dev/null
+++ b/libc/src/wchar/wcstod.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for wcstod ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOD_H
+#define LLVM_LIBC_SRC_WCHAR_WCSTOD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+double wcstod(const wchar_t *__restrict str, wchar_t **__restrict str_end);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WCSTOD_H
diff --git a/libc/src/wchar/wcstold.cpp b/libc/src/wchar/wcstold.cpp
new file mode 100644
index 0000000000000..ffbc3f248b883
--- /dev/null
+++ b/libc/src/wchar/wcstold.cpp
@@ -0,0 +1,30 @@
+//===-- Implementation of wcstold -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstold.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/str_to_float.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(long double, wcstold,
+                   (const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end)) {
+  auto result = internal::strtofloatingpoint<long double>(str);
+  if (result.has_error())
+    libc_errno = result.error;
+
+  if (str_end != nullptr)
+    *str_end = const_cast<wchar_t *>(str + result.parsed_len);
+
+  return result.value;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
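`wcstold` is deliberately identical to `wcstod` apart from the `long double` instantiation of the shared `internal::strtofloatingpoint` parser. One observable consequence is the ERANGE behavior on overflow, sketched below as a standalone example against the standard `<cwchar>` API (not part of the patch; the input literal is illustrative).

```cpp
#include <cerrno>
#include <cmath>
#include <cstdio>
#include <cwchar>

int main() {
  // Overflow: the conversion reports ERANGE and returns an infinity,
  // mirroring the libc_errno handling in the implementations above.
  errno = 0;
  long double huge = std::wcstold(L"1e99999", nullptr);
  std::printf("isinf=%d erange=%d\n", std::isinf(huge) ? 1 : 0,
              errno == ERANGE ? 1 : 0);
  return 0;
}
```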
diff --git a/libc/src/wchar/wcstold.h b/libc/src/wchar/wcstold.h
new file mode 100644
index 0000000000000..1525362b33571
--- /dev/null
+++ b/libc/src/wchar/wcstold.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for wcstold -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
+#define LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+long double wcstold(const wchar_t *__restrict str,
+                    wchar_t **__restrict str_end);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WCSTOLD_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 122cad2575327..a62a30fe00124 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -538,5 +538,32 @@ add_libc_test(
   DEPENDS
     libc.src.wchar.wcstof
     libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.LibcFPTestHelpers
+  LINK_LIBRARIES
+    LibcFPTestHelpers
+)
+
+add_libc_test(
+  wcstod_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wcstod_test.cpp
+  DEPENDS
+    libc.src.wchar.wcstod
+    libc.test.UnitTest.ErrnoCheckingTest
+  LINK_LIBRARIES
+    LibcFPTestHelpers
+)
+
+add_libc_test(
+  wcstold_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wcstold_test.cpp
+  DEPENDS
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.uint128
+    libc.src.wchar.wcstold
+    libc.test.UnitTest.ErrnoCheckingTest
 )
diff --git a/libc/test/src/wchar/wcstod_test.cpp b/libc/test/src/wchar/wcstod_test.cpp
new file mode 100644
index 0000000000000..0c2b82cfba898
--- /dev/null
+++ b/libc/test/src/wchar/wcstod_test.cpp
@@ -0,0 +1,586 @@
+//===-- Unittests for wcstod ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wcstod.h"
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/RoundingModeUtils.h"
+#include "test/UnitTest/Test.h"
+
+#include <stddef.h>
+
+using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest;
+using LIBC_NAMESPACE::fputil::testing::RoundingMode;
+
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+
+class LlvmLibcWcstodTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
+                           ForceRoundingModeTest {
+public:
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const uint64_t expectedRawData, const int expectedErrno = 0) {
+    // expectedRawData is the expected double result as a uint64_t, organized
+    // according to IEEE754:
+    //
+    // +-- 1 Sign Bit                        +-- 52 Mantissa bits
+    // |                                     |
+    // |           +-------------------------+------------------------+
+    // |           |                                                  |
+    // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+    //  |         |
+    //  +----+----+
+    //       |
+    //       +-- 11 Exponent Bits
+    //
+    // This is so that the result can be compared in parts.
+ wchar_t *str_end = nullptr; + + LIBC_NAMESPACE::fputil::FPBits expected_fp = + LIBC_NAMESPACE::fputil::FPBits(expectedRawData); + + double result = LIBC_NAMESPACE::wcstod(inputString, &str_end); + if (expectedErrno == 0) + EXPECT_THAT(result, Succeeds(expected_fp.get_val())); + else + EXPECT_THAT(result, Fails(expectedErrno, expected_fp.get_val())); + EXPECT_EQ(str_end - inputString, expectedStrLen); + } +}; + +TEST_F(LlvmLibcWcstodTest, SimpleTest) { + run_test(L"123", 3, uint64_t(0x405ec00000000000)); + + // This should fail on Eisel-Lemire, forcing a fallback to simple decimal + // conversion. + run_test(L"12345678901234549760", 20, uint64_t(0x43e56a95319d63d8)); + + // Found while looking for difficult test cases here: + // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt + run_test(L"1090544144181609348835077142190", 31, + uint64_t(0x462b8779f2474dfb)); + + run_test(L"0x123", 5, uint64_t(0x4072300000000000)); +} + +// These are tests that have caused problems in the past. +TEST_F(LlvmLibcWcstodTest, SpecificFailures) { + run_test(L"3E70000000000000", 16, uint64_t(0x7FF0000000000000), ERANGE); + run_test(L"358416272e-33", 13, uint64_t(0x3adbbb2a68c9d0b9)); + run_test(L"2.16656806400000023841857910156251e9", 36, + uint64_t(0x41e0246690000001)); + run_test(L"27949676547093071875", 20, uint64_t(0x43f83e132bc608c9)); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000e-800", + 806, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000e-799", + 806, 0x4024000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000e-800", + 807, 0x4024000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000e-64", + 69, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000e-128", + 134, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000e-256", + 262, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000e-512", + 518, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000e-1024", + 1031, 0x3ff0000000000000); + run_test( + L"0" + "100000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000000000e-1024", + 1032, 0x3ff0000000000000); +} + +TEST_F(LlvmLibcWcstodTest, FuzzFailures) { + run_test(L"-\xff\xff\xff\xff\xff\xff\xff\x01", 0, uint64_t(0)); + run_test(L"-.????", 0, uint64_t(0)); + run_test( + L"44444444444444444444444444444444444444444444444444A44444444444444444" + "44444444444*\x99\xff\xff\xff\xff", + 50, uint64_t(0x4a3e68fdd0e0b2d8)); + run_test(L"-NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNKNNNNNNNNNNNNNNNNNN?" + "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN?", + 0, uint64_t(0)); + run_test(L"0x.666E40", 9, uint64_t(0x3fd99b9000000000)); + + // glibc version 2.36 and higher (not tested with lower versions) disagrees + // with this result, but ours is correct for the nearest rounding mode. See + // this bug: https://sourceware.org/bugzilla/show_bug.cgi?id=30220 + run_test(L"0x30000002222225p-1077", 22, uint64_t(0x0006000000444445), ERANGE); + + // This value triggered a bug by having an exponent exactly equal to the + // maximum. The overflow checks would accept a value less than the max value + // as valid and greater than the max value as invalid (and set it to the max), + // but an exponent of exactly max value hit the else condition which is + // intended for underflow and set the exponent to the min exponent. + run_test( + L"18477446000000000000000000000000000005230000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000935166201543003765631683711878842" + "388777446000000000000430037600000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000005238581124701719460000000" + "000000000017194600000000000000000070046000000000000000000000000100000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000002000000000000000" + "000000000000056316837118788423887774460000000000000000000000000000052385" + "811247017194600000000000000000171946000000000000000000700460000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000002000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000523858112470171946000000" + "000000000001719460000000000000000007004600000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0200000000000000000E608", + 1462, uint64_t(0x7ff0000000000000), ERANGE); + + // Same as above but for hex. 
+ run_test(L"0x0164810157p2047", 17, uint64_t(0x7ff0000000000000), ERANGE); + + // This test ensures that only the correct number of characters is accepted. + // An exponent symbol followed by a sign isn't a valid exponent. + run_test(L"2e+", 1, uint64_t(0x4000000000000000)); + run_test(L"0x2p+", 3, uint64_t(0x4000000000000000)); + + // This bug was in the handling of very large exponents in the exponent + // marker. Previously anything greater than 10,000 would be set to 10,000. + // This caused incorrect behavior if there were more than 10,000 '0's in the + // input number, and then a correspondingly large exponent. This test case has + // 24,744 zeroes. + run_test( + L"0x." + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000fp551615", + 24755, uint64_t(0x7ff0000000000000), ERANGE); +} diff --git a/libc/test/src/wchar/wcstold_test.cpp b/libc/test/src/wchar/wcstold_test.cpp new file mode 100644 index 0000000000000..3a7fdfce3e732 --- /dev/null +++ b/libc/test/src/wchar/wcstold_test.cpp @@ -0,0 +1,262 @@ +//===-- Unittests for wcstold ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/uint128.h"
+#include "src/wchar/wcstold.h"
+
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+#include <stddef.h>
+
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+#define SELECT_CONST(val, _, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80)
+#define SELECT_CONST(_, val, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT128)
+#define SELECT_CONST(_, __, val) val
+#else
+#error "Unknown long double type"
+#endif
+
+class LlvmLibcWcstoldTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+public:
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const uint64_t expectedRawData, const int expectedErrno = 0)
+#else
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const UInt128 expectedRawData, const int expectedErrno = 0)
+#endif
+  {
+    // expectedRawData64 is the expected long double result as a uint64_t,
+    // organized according to the IEEE754 double precision format:
+    //
+    // +-- 1 Sign Bit                        +-- 52 Mantissa bits
+    // |                                     |
+    // |           +-------------------------+------------------------+
+    // |           |                                                  |
+    // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+    //  |         |
+    //  +----+----+
+    //       |
+    //       +-- 11 Exponent Bits
+
+    // expectedRawData80 is the expected long double result as a UInt128,
+    // organized according to the x86 extended precision format:
+    //
+    // +-- 1 Sign Bit
+    // |
+    // |               +-- 1 Integer part bit (1 unless this is a subnormal)
+    // |               |
+    // SEEEEEEEEEEEEEEEIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             |  |                                                      |
+    //  +------+------+  +----------------------------+-------------------------+
+    //         |                                      |
+    //         +-- 15 Exponent Bits                   +-- 63 Mantissa bits
+
+    // expectedRawData128 is the expected long double result as a UInt128,
+    // organized according to IEEE754 quadruple precision format:
+    //
+    // +-- 1 Sign Bit                   +-- 112 Mantissa bits
+    // |                                |
+    // |               +----------------+---------------------------------------+
+    // |               |                                                        |
+    // SEEEEEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             |
+    //  +------+------+
+    //         |
+    //         +-- 15 Exponent Bits
+    wchar_t *str_end = nullptr;
+
+    using FPBits = LIBC_NAMESPACE::fputil::FPBits<long double>;
+    FPBits expected_fp =
+        FPBits(static_cast<FPBits::StorageType>(expectedRawData));
+    const int expected_errno = expectedErrno;
+
+    long double result = LIBC_NAMESPACE::wcstold(inputString, &str_end);
+
+    LIBC_NAMESPACE::fputil::FPBits<long double> actual_fp =
+        LIBC_NAMESPACE::fputil::FPBits<long double>();
+    actual_fp = LIBC_NAMESPACE::fputil::FPBits<long double>(result);
+
+    EXPECT_EQ(str_end - inputString, expectedStrLen);
+
+    EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+    EXPECT_EQ(actual_fp.is_neg(), expected_fp.is_neg());
+    EXPECT_EQ(actual_fp.get_exponent(), expected_fp.get_exponent());
+    EXPECT_EQ(actual_fp.get_mantissa(), expected_fp.get_mantissa());
+    ASSERT_ERRNO_EQ(expected_errno);
+  }
+};
+
+TEST_F(LlvmLibcWcstoldTest, SimpleTest) {
+  run_test(L"123", 3,
+           SELECT_CONST(uint64_t(0x405ec00000000000),
+                        UInt128(0x4005f60000) << 40,
+                        UInt128(0x4005ec0000000000) << 64));
+
+  // This should fail on Eisel-Lemire, forcing a fallback to simple decimal
+  // conversion.
+ run_test(L"12345678901234549760", 20, + SELECT_CONST(uint64_t(0x43e56a95319d63d8), + (UInt128(0x403eab54a9) << 40) + UInt128(0x8ceb1ec400), + (UInt128(0x403e56a95319d63d) << 64) + + UInt128(0x8800000000000000))); + + // Found while looking for difficult test cases here: + // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt + run_test(L"1090544144181609348835077142190", 31, + SELECT_CONST(uint64_t(0x462b8779f2474dfb), + (UInt128(0x4062dc3bcf) << 40) + UInt128(0x923a6fd402), + (UInt128(0x4062b8779f2474df) << 64) + + UInt128(0xa804bfd8c6d5c000))); + + run_test(L"0x123", 5, + SELECT_CONST(uint64_t(0x4072300000000000), + (UInt128(0x4007918000) << 40), + (UInt128(0x4007230000000000) << 64))); +} + +// These are tests that have caused problems for doubles in the past. +TEST_F(LlvmLibcWcstoldTest, Float64SpecificFailures) { + run_test(L"3E70000000000000", 16, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64)), + ERANGE); + run_test(L"358416272e-33", 13, + SELECT_CONST(uint64_t(0x3adbbb2a68c9d0b9), + (UInt128(0x3fadddd953) << 40) + UInt128(0x464e85c400), + (UInt128(0x3fadbbb2a68c9d0b) << 64) + + UInt128(0x8800e7969e1c5fc8))); + run_test(L"2.16656806400000023841857910156251e9", 36, + SELECT_CONST(uint64_t(0x41e0246690000001), + (UInt128(0x401e812334) << 40) + UInt128(0x8000000400), + (UInt128(0x401e024669000000) << 64) + + UInt128(0x800000000000018))); + run_test(L"27949676547093071875", 20, + SELECT_CONST(uint64_t(0x43f83e132bc608c9), + (UInt128(0x403fc1f099) << 40) + UInt128(0x5e30464402), + (UInt128(0x403f83e132bc608c) << 64) + + UInt128(0x8803000000000000))); +} + +TEST_F(LlvmLibcWcstoldTest, Float80SpecificFailures) { + run_test(L"777777777777777777777777777777777777777777777777777777777777777777" + "7777777777777777777777777777777777", + 100, + SELECT_CONST(uint64_t(0x54ac729b8fcaf734), + (UInt128(0x414ae394dc) << 40) + UInt128(0x7e57b9a0c2), + (UInt128(0x414ac729b8fcaf73) << 64) + + UInt128(0x4184a3d793224129))); +} + +TEST_F(LlvmLibcWcstoldTest, MaxSizeNumbers) { + run_test(L"1.1897314953572317650e4932", 26, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7ffeffffff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xfffd57322e3f8675)), + SELECT_CONST(ERANGE, 0, 0)); + run_test(L"1.18973149535723176508e4932", 27, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xffffd2478338036c)), + SELECT_CONST(ERANGE, ERANGE, 0)); +} + +// These tests check subnormal behavior for 80 bit and 128 bit floats. They will +// be too small for 64 bit floats. 
+TEST_F(LlvmLibcWcstoldTest, SubnormalTests) {
+  run_test(L"1e-4950", 7,
+           SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000003)),
+                        (UInt128(0x000000000000000000057c9647e1a018))),
+           ERANGE);
+  run_test(L"1.89e-4951", 10,
+           SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000001)),
+                        (UInt128(0x0000000000000000000109778a006738))),
+           ERANGE);
+  run_test(L"4e-4966", 7,
+           SELECT_CONST(uint64_t(0), (UInt128(0)),
+                        (UInt128(0x00000000000000000000000000000001))),
+           ERANGE);
+}
+
+TEST_F(LlvmLibcWcstoldTest, SmallNormalTests) {
+  run_test(L"3.37e-4932", 10,
+           SELECT_CONST(
+               uint64_t(0), (UInt128(0x1804cf7) << 40) + UInt128(0x908850712),
+               (UInt128(0x10099ee12110a) << 64) + UInt128(0xe24b75c0f50dc0c)),
+           SELECT_CONST(ERANGE, 0, 0));
+}
+
+TEST_F(LlvmLibcWcstoldTest, ComplexHexadecimalTests) {
+  run_test(L"0x1p16383", 9,
+           SELECT_CONST(0x7ff0000000000000, (UInt128(0x7ffe800000) << 40),
+                        (UInt128(0x7ffe000000000000) << 64)),
+           SELECT_CONST(ERANGE, 0, 0));
+  run_test(L"0x123456789abcdef", 17,
+           SELECT_CONST(0x43723456789abcdf,
+                        (UInt128(0x403791a2b3) << 40) + UInt128(0xc4d5e6f780),
+                        (UInt128(0x403723456789abcd) << 64) +
+                            UInt128(0xef00000000000000)));
+  run_test(L"0x123456789abcdef0123456789ABCDEF", 33,
+           SELECT_CONST(0x47723456789abcdf,
+                        (UInt128(0x407791a2b3) << 40) + UInt128(0xc4d5e6f781),
+                        (UInt128(0x407723456789abcd) << 64) +
+                            UInt128(0xef0123456789abce)));
+}
+
+TEST_F(LlvmLibcWcstoldTest, InfTests) {
+  run_test(L"INF", 3,
+           SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40),
+                        (UInt128(0x7fff000000000000) << 64)));
+  run_test(L"INFinity", 8,
+           SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40),
+                        (UInt128(0x7fff000000000000) << 64)));
+  run_test(L"-inf", 4,
+           SELECT_CONST(0xfff0000000000000, (UInt128(0xffff800000) << 40),
+                        (UInt128(0xffff000000000000) << 64)));
+}
+
+TEST_F(LlvmLibcWcstoldTest, NaNTests) {
+  run_test(L"NaN", 3,
+           SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40),
+                        (UInt128(0x7fff800000000000) << 64)));
+  run_test(L"-nAn", 4,
+           SELECT_CONST(0xfff8000000000000, (UInt128(0xffffc00000) << 40),
+                        (UInt128(0xffff800000000000) << 64)));
+  run_test(L"NaN()", 5,
+           SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40),
+                        (UInt128(0x7fff800000000000) << 64)));
+  run_test(L"NaN(1234)", 9,
+           SELECT_CONST(0x7ff80000000004d2,
+                        (UInt128(0x7fffc00000) << 40) + UInt128(0x4d2),
+                        (UInt128(0x7fff800000000000) << 64) + UInt128(0x4d2)));
+  run_test(L"NaN(0xffffffffffff)", 19,
+           SELECT_CONST(0x7ff8ffffffffffff,
+                        (UInt128(0x7fffc000ff) << 40) + UInt128(0xffffffffff),
+                        (UInt128(0x7fff800000000000) << 64) +
+                            UInt128(0xffffffffffff)));
+  run_test(L"NaN(0xfffffffffffff)", 20,
+           SELECT_CONST(0x7fffffffffffffff,
+                        (UInt128(0x7fffc00fff) << 40) + UInt128(0xffffffffff),
+                        (UInt128(0x7fff800000000000) << 64) +
+                            UInt128(0xfffffffffffff)));
+  run_test(L"NaN(0xffffffffffffffff)", 23,
+           SELECT_CONST(0x7fffffffffffffff,
+                        (UInt128(0x7fffffffff) << 40) + UInt128(0xffffffffff),
+                        (UInt128(0x7fff800000000000) << 64) +
+                            UInt128(0xffffffffffffffff)));
+  run_test(L"NaN( 1234)", 3,
+           SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40),
+                        (UInt128(0x7fff800000000000) << 64)));
+}
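The tests above express expected results as raw bit patterns so that sign, exponent, and mantissa can be compared independently. As a standalone illustration (not part of the patch), here is how one such binary64 pattern decodes; the constant 0x405ec00000000000 is the expected value the SimpleTest case uses for L"123", and the field extraction mirrors what the FPBits comparisons do.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint64_t raw = 0x405ec00000000000; // expected bit pattern for 123.0
  double value;
  static_assert(sizeof(value) == sizeof(raw), "IEEE754 binary64 assumed");
  std::memcpy(&value, &raw, sizeof(value)); // reinterpret the bits as a double
  uint64_t sign = raw >> 63;                  // 1 sign bit
  uint64_t exponent = (raw >> 52) & 0x7ff;    // 11 biased exponent bits
  uint64_t mantissa = raw & ((1ull << 52) - 1); // 52 mantissa bits
  // For 123.0: exponent 0x405 = 1029 (bias 1023 -> 2^6), mantissa 1.921875.
  std::printf("%g sign=%llu exp=%llu mant=%#llx\n", value,
              (unsigned long long)sign, (unsigned long long)exponent,
              (unsigned long long)mantissa);
  return 0;
}
```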
code-block:: bash $ mkdir build - $ cmake -G Ninja -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang" \ # Configure - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ - -DLLVM_RUNTIME_TARGETS="" - $ ninja -C build runtimes # Build - $ ninja -C build check-runtimes # Test - $ ninja -C build install-runtimes # Install + $ cmake -G Ninja -S llvm -B build \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DLLVM_ENABLE_PROJECTS="clang" \ # Configure + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ + -DLLVM_RUNTIME_TARGETS="" + $ ninja -C build runtimes # Build + $ ninja -C build check-runtimes # Test + $ ninja -C build install-runtimes # Install .. note:: - This type of build is also commonly called a "Runtimes build", but we would like to move diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index e1897949a47e6..ef487fb06dd5e 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -1910,6 +1910,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_it __bucket_list_[__next_chash] = __before_first; __chash = __next_chash; } + } else { // When __next is a nullptr we've fully erased the last bucket. Update the bucket list accordingly. + __bucket_list_[__chash] = nullptr; } } diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp index 532413437f6be..81371638143c9 100644 --- a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp @@ -57,6 +57,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_map<int, int> m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_map m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multimap m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multiset<int, std::hash<int>, std::equal_to<int>, min_allocator<int>> C; diff --git a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp index 5fa6e4199f756..1f049a295b8c3 100644 --- a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp
+++ b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp @@ -47,6 +47,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_set<int> m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_set<int, std::hash<int>, std::equal_to<int>, min_allocator<int>> C; diff --git a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp similarity index 70% rename from libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp rename to libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp index 1a92cc925e2aa..281ef37e92d27 100644 --- a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp +++ b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp @@ -7,15 +7,11 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11 -// test -// gets +// Verify that std::gets has been removed in C++14 and later #include <cstdio> -int main(int, char**) -{ - (void) std::gets((char *) NULL); - - return 0; +void f(char const* str) { + (void)std::gets(str); // expected-error {{no member named 'gets' in namespace 'std'}} } diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp deleted file mode 100644 index a03fd52c03562..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const int (&submatches)[N], -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - const int indices[] = {-1, 0, 1}; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), indices); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp deleted file mode 100644 index b6913e6b32d12..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// initializer_list submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp deleted file mode 100644 index 3c39d4983e26c..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <regex> - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, int submatch = 0, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include <regex> -#include <cassert> -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-\\d{4}"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp new file mode 100644 index 0000000000000..b1ab0f337de2f --- /dev/null +++ b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11 + +// Ensure that we don't allow iterators into temporary std::regex objects. + +// <regex> +// +// class regex_iterator +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, int submatch = 0, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template <std::size_t N> +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const int (&submatches)[N], +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// initializer_list<int> submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const std::vector<int>& submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); + +#include <iterator> +#include <regex> +#include <vector> + +void f() { + std::regex phone_numbers("\\d{3}-\\d{4}"); + const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; + + { // int submatch + std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), -1); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const int (&submatches)[N] + const int indices[] = {-1, 0, 1}; + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), indices); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // initializer_list<int> submatches + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const std::vector<int>& submatches + std::vector<int> v; + v.push_back(-1); + v.push_back(-1);
std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), v); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } +} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp deleted file mode 100644 index 9b07df9d1a783..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp +++ /dev/null @@ -1,41 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const std::vector& submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::vector v; - v.push_back(-1); - v.push_back(-1); - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), v); - } - - return 0; -} diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index d265dddebe11f..7442361627104 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -366,11 +366,12 @@ bootstrapping-build) -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLLVM_ENABLE_PROJECTS="clang;lldb" \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ -DLLVM_RUNTIME_TARGETS="$(${CXX} --print-target-triple)" \ -DLLVM_HOST_TRIPLE="$(${CXX} --print-target-triple)" \ -DLLVM_TARGETS_TO_BUILD="host" \ -DRUNTIMES_BUILD_ALLOW_DARWIN=ON \ + -DCOMPILER_RT_INCLUDE_TESTS=OFF \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 9a70c0d19c41d..19b08152ae081 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2747,9 +2747,9 @@ RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, ".relro_padding", SHT_NOBITS, SHF_ALLOC | SHF_WRITE, 1) {} -PaddingSection::PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent) - : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1), - size(size) { +PaddingSection::PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent) + : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1) { + size = amount; this->parent = parent; } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 38e68110e4bc0..66c866d7e8cde 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -78,8 +78,6 @@ class EhFrameSection final : public SyntheticSection { // allocating one for each EhInputSection. 
llvm::DenseMap offsetToCie; - uint64_t size = 0; - template void addRecords(EhInputSection *s); template void iterateFDEWithLSDAAux(EhInputSection &sec, @@ -127,7 +125,6 @@ class GotSection final : public SyntheticSection { protected: size_t numEntries = 0; uint32_t tlsIndexOff = -1; - uint64_t size = 0; struct AuthEntryInfo { size_t offset; bool isSymbolFunc; @@ -182,7 +179,6 @@ class BssSection final : public SyntheticSection { static bool classof(const SectionBase *s) { return isa(s) && cast(s)->bss; } - uint64_t size; }; class MipsGotSection final : public SyntheticSection { @@ -312,8 +308,6 @@ class MipsGotSection final : public SyntheticSection { // Number of "Header" entries. static const unsigned headerEntriesNum = 2; - uint64_t size = 0; - // Symbol and addend. using GotEntry = std::pair; @@ -407,8 +401,6 @@ class StringTableSection final : public SyntheticSection { private: const bool dynamic; - uint64_t size = 0; - llvm::DenseMap stringMap; SmallVector strings; }; @@ -475,7 +467,6 @@ template class DynamicSection final : public SyntheticSection { private: std::vector> computeContents(); - uint64_t size = 0; }; class RelocationBaseSection : public SyntheticSection { @@ -780,10 +771,8 @@ class RelroPaddingSection final : public SyntheticSection { }; class PaddingSection final : public SyntheticSection { - uint64_t size; - public: - PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent); + PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent); size_t getSize() const override { return size; } void writeTo(uint8_t *buf) override; }; diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp index cf657aad5d145..b652d1ee8325f 100644 --- a/lld/MachO/SectionPriorities.cpp +++ b/lld/MachO/SectionPriorities.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/xxhash.h" #include @@ -246,33 +247,45 @@ DenseMap CallGraphSort::run() { return orderMap; } -std::optional -macho::PriorityBuilder::getSymbolOrCStringPriority(const StringRef key, - InputFile *f) { +void macho::PriorityBuilder::SymbolPriorityEntry::setPriority( + int priority, StringRef objectFile) { + if (!objectFile.empty()) + objectFiles.try_emplace(objectFile, priority); + else + anyObjectFile = std::min(anyObjectFile, priority); +} - auto it = priorities.find(key); - if (it == priorities.end()) - return std::nullopt; - const SymbolPriorityEntry &entry = it->second; +int macho::PriorityBuilder::SymbolPriorityEntry::getPriority( + const InputFile *f) const { if (!f) - return entry.anyObjectFile; + return anyObjectFile; // We don't use toString(InputFile *) here because it returns the full path // for object files, and we only want the basename. - StringRef filename; - if (f->archiveName.empty()) - filename = path::filename(f->getName()); - else - filename = saver().save(path::filename(f->archiveName) + "(" + - path::filename(f->getName()) + ")"); - return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile); + StringRef basename = path::filename(f->getName()); + StringRef filename = + f->archiveName.empty() + ? 
basename + : saver().save(path::filename(f->archiveName) + "(" + basename + ")"); + return std::min(objectFiles.lookup(filename), anyObjectFile); } std::optional<int> -macho::PriorityBuilder::getSymbolPriority(const Defined *sym) { +macho::PriorityBuilder::getCStringPriority(uint32_t hash, + const InputFile *f) const { + auto it = cStringPriorities.find(hash); + if (it == cStringPriorities.end()) + return std::nullopt; + return it->second.getPriority(f); +} + +std::optional<int> +macho::PriorityBuilder::getSymbolPriority(const Defined *sym) const { if (sym->isAbsolute()) return std::nullopt; - return getSymbolOrCStringPriority(utils::getRootSymbol(sym->getName()), - sym->isec()->getFile()); + auto it = priorities.find(utils::getRootSymbol(sym->getName())); + if (it == priorities.end()) + return std::nullopt; + return it->second.getPriority(sym->isec()->getFile()); } void macho::PriorityBuilder::extractCallGraphProfile() { @@ -307,7 +320,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { int prio = std::numeric_limits<int>::min(); MemoryBufferRef mbref = *buffer; for (StringRef line : args::getLines(mbref)) { - StringRef objectFile, symbolOrCStrHash; + StringRef objectFile; line = line.take_until([](char c) { return c == '#'; }); // ignore comments line = line.ltrim(); @@ -338,22 +351,16 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { } // The rest of the line is either <symbol name> or - // CStringEntryPrefix <cstring hash> + // cStringEntryPrefix <cstring hash> line = line.trim(); - if (line.starts_with(CStringEntryPrefix)) { - StringRef possibleHash = line.drop_front(CStringEntryPrefix.size()); + if (line.consume_front(cStringEntryPrefix)) { uint32_t hash = 0; - if (to_integer(possibleHash, hash)) - symbolOrCStrHash = possibleHash; - } else - symbolOrCStrHash = utils::getRootSymbol(line); - - if (!symbolOrCStrHash.empty()) { - SymbolPriorityEntry &entry = priorities[symbolOrCStrHash]; - if (!objectFile.empty()) - entry.objectFiles.insert(std::make_pair(objectFile, prio)); - else - entry.anyObjectFile = std::min(entry.anyObjectFile, prio); + if (to_integer(line, hash)) + cStringPriorities[hash].setPriority(prio, objectFile); + } else { + StringRef symbol = utils::getRootSymbol(line); + if (!symbol.empty()) + priorities[symbol].setPriority(prio, objectFile); } ++prio; @@ -405,40 +412,39 @@ macho::PriorityBuilder::buildInputSectionPriorities() { return sectionPriorities; } -std::vector<StringPiecePair> macho::PriorityBuilder::buildCStringPriorities( - ArrayRef<CStringInputSection *> inputs) { - // Split the input strings into hold and cold sets. - // Order hot set based on -order_file_cstring for performance improvement; - // TODO: Order cold set of cstrings for compression via BP. - std::vector<std::pair<int, StringPiecePair>> - hotStringPrioritiesAndStringPieces; - std::vector<StringPiecePair> coldStringPieces; - std::vector<StringPiecePair> orderedStringPieces; - +void macho::PriorityBuilder::forEachStringPiece( + ArrayRef<CStringInputSection *> inputs, + std::function<void(CStringInputSection &, StringPiece &, size_t)> f, + bool forceInputOrder, bool computeHash) const { + std::vector<std::tuple<int, CStringInputSection *, size_t>> orderedPieces; + std::vector<std::pair<CStringInputSection *, size_t>> unorderedPieces; for (CStringInputSection *isec : inputs) { for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) { if (!piece.live) continue; - - std::optional<int> priority = getSymbolOrCStringPriority( - std::to_string(piece.hash), isec->getFile()); - if (!priority) - coldStringPieces.emplace_back(isec, stringPieceIdx); + // Process pieces in input order if we have no cstrings in our orderfile + if (forceInputOrder || cStringPriorities.empty()) { + f(*isec, piece, stringPieceIdx); + continue; + } + uint32_t hash = + computeHash + ?
(xxh3_64bits(isec->getStringRef(stringPieceIdx)) & 0x7fffffff) + : piece.hash; + if (auto priority = getCStringPriority(hash, isec->getFile())) + orderedPieces.emplace_back(*priority, isec, stringPieceIdx); else - hotStringPrioritiesAndStringPieces.emplace_back( - *priority, std::make_pair(isec, stringPieceIdx)); + unorderedPieces.emplace_back(isec, stringPieceIdx); } } - - // Order hot set for perf - llvm::stable_sort(hotStringPrioritiesAndStringPieces); - for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces) - orderedStringPieces.push_back(stringPiecePair); - - // TODO: Order cold set for compression - - orderedStringPieces.insert(orderedStringPieces.end(), - coldStringPieces.begin(), coldStringPieces.end()); - - return orderedStringPieces; + if (orderedPieces.empty() && unorderedPieces.empty()) + return; + llvm::stable_sort(orderedPieces, [](const auto &left, const auto &right) { + return std::get<0>(left) < std::get<0>(right); + }); + for (auto &[priority, isec, pieceIdx] : orderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); + // TODO: Add option to order the remaining cstrings for compression + for (auto &[isec, pieceIdx] : unorderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); } diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h index cc4e30fffc600..24d2dbc47e498 100644 --- a/lld/MachO/SectionPriorities.h +++ b/lld/MachO/SectionPriorities.h @@ -16,7 +16,6 @@ namespace lld::macho { using SectionPair = std::pair; -using StringPiecePair = std::pair; class PriorityBuilder { public: @@ -29,7 +28,7 @@ class PriorityBuilder { // // An order file has one entry per line, in the following format: // - // ::[ | CStringEntryPrefix ] + // ::[ | cStringEntryPrefix ] // // and are optional. // If not specified, then that entry tries to match either, @@ -42,7 +41,7 @@ class PriorityBuilder { // lowest-ordered entry (the one nearest to the front of the list.) // // or 2) any cstring literal with the given hash, if the entry has the - // CStringEntryPrefix prefix defined below in the file. is the + // cStringEntryPrefix prefix defined below in the file. is the // hash of cstring literal content. // // Cstring literals are not symbolized, we can't identify them by name @@ -54,6 +53,16 @@ class PriorityBuilder { // The file can also have line comments that start with '#'. void parseOrderFile(StringRef path); + /// Call \p f for each string piece in \p inputs. If there are any cstring + /// literals in the orderfile (and \p forceInputOrder is false) then string + /// pieces are ordered by the orderfile. \p computeHash must be set when + /// \p deduplicateLiterals is false because then the string piece hash is not + /// set. + void forEachStringPiece( + ArrayRef inputs, + std::function f, + bool forceInputOrder = false, bool computeHash = false) const; + // Returns layout priorities for some or all input sections. Sections are laid // out in decreasing order; that is, a higher priority section will be closer // to the beginning of its output section. @@ -66,8 +75,6 @@ class PriorityBuilder { // Each section gets assigned the priority of the highest-priority symbol it // contains. llvm::DenseMap buildInputSectionPriorities(); - std::vector - buildCStringPriorities(ArrayRef); private: // The symbol with the smallest priority should be ordered first in the output @@ -78,13 +85,16 @@ class PriorityBuilder { int anyObjectFile = 0; // The priority given to a matching symbol from a particular object file. 
llvm::DenseMap objectFiles; + void setPriority(int priority, StringRef objectFile); + int getPriority(const InputFile *f) const; }; - const llvm::StringRef CStringEntryPrefix = "CSTR;"; + const llvm::StringRef cStringEntryPrefix = "CSTR;"; - std::optional getSymbolPriority(const Defined *sym); - std::optional getSymbolOrCStringPriority(const StringRef key, - InputFile *f); + std::optional getSymbolPriority(const Defined *sym) const; + std::optional getCStringPriority(uint32_t hash, + const InputFile *f) const; llvm::DenseMap priorities; + llvm::DenseMap cStringPriorities; llvm::MapVector callGraphProfile; }; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 187cccbe90dbc..fecc51f912b08 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1721,26 +1721,24 @@ void CStringSection::writeTo(uint8_t *buf) const { // and don't need this alignment. They will be emitted at some arbitrary address // `A`, but ld64 will treat them as being 16-byte aligned with an offset of // `16 % A`. -static Align getStringPieceAlignment(const CStringInputSection *isec, +static Align getStringPieceAlignment(const CStringInputSection &isec, const StringPiece &piece) { - return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff)); + return llvm::Align(1ULL << llvm::countr_zero(isec.align | piece.inSecOff)); } void CStringSection::finalizeContents() { size = 0; - // TODO: Call buildCStringPriorities() to support cstring ordering when - // deduplication is off, although this may negatively impact build - // performance. - for (CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); - StringRef string = isec->getStringRef(i); - size = piece.outSecOff + string.size() + 1; // account for null terminator - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); + StringRef string = isec.getStringRef(pieceIdx); + size = + piece.outSecOff + string.size() + 1; // account for null terminator + }, + /*forceInputOrder=*/false, /*computeHash=*/true); + for (CStringInputSection *isec : inputs) isec->isFinal = true; - } } void DeduplicatedCStringSection::finalizeContents() { @@ -1748,20 +1746,19 @@ void DeduplicatedCStringSection::finalizeContents() { DenseMap strToAlignment; // Used for tail merging only std::vector deduplicatedStrs; - for (const CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - auto s = isec->getCachedHashStringRef(i); - assert(isec->align != 0); - auto align = getStringPieceAlignment(isec, piece); - auto [it, wasInserted] = strToAlignment.try_emplace(s, align); - if (config->tailMergeStrings && wasInserted) - deduplicatedStrs.push_back(s); - if (!wasInserted && it->second < align) - it->second = align; - } - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); + assert(isec.align != 0); + auto align = getStringPieceAlignment(isec, piece); + auto [it, wasInserted] = strToAlignment.try_emplace(s, align); + if (config->tailMergeStrings && wasInserted) + deduplicatedStrs.push_back(s); + if (!wasInserted && it->second < align) + it->second = align; 
+ }, + /*forceInputOrder=*/true); // Like lexigraphical sort, except we read strings in reverse and take the // longest string first @@ -1801,9 +1798,10 @@ void DeduplicatedCStringSection::finalizeContents() { // Sort the strings for performance and compression size win, and then // assign an offset for each string and save it to the corresponding // StringPieces for easy access. - for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) { - auto &piece = isec->pieces[i]; - auto s = isec->getCachedHashStringRef(i); + priorityBuilder.forEachStringPiece(inputs, [&](CStringInputSection &isec, + StringPiece &piece, + size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); // Any string can be tail merged with itself with an offset of zero uint64_t tailMergeOffset = 0; auto mergeIt = @@ -1829,7 +1827,7 @@ void DeduplicatedCStringSection::finalizeContents() { stringOffsetMap[tailMergedString] = piece.outSecOff; assert(isAligned(strToAlignment.at(tailMergedString), piece.outSecOff)); } - } + }); for (CStringInputSection *isec : inputs) isec->isFinal = true; } diff --git a/lld/test/MachO/order-file-cstring.s b/lld/test/MachO/order-file-cstring.s index 3c6d2a377dc38..d6734308fffdf 100644 --- a/lld/test/MachO/order-file-cstring.s +++ b/lld/test/MachO/order-file-cstring.s @@ -4,32 +4,34 @@ # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/more-cstrings.s -o %t/more-cstrings.o -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1 | FileCheck %s --check-prefix=ONE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1 | FileCheck %s --check-prefix=ONE_SEC +# RUN: %lld --no-deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1-dup %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1-dup | FileCheck %s --check-prefix=ONE_SYM +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1-dup | FileCheck %s --check-prefix=ONE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-2 | FileCheck %s --check-prefix=TWO_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-2 | FileCheck %s --check-prefix=TWO_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-3 | FileCheck %s --check-prefix=THREE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-3 | FileCheck %s 
--check-prefix=THREE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-4 | FileCheck %s --check-prefix=FOUR_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC # RUN: llvm-readobj --string-dump=__cstring %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC_ESCAPE - # We expect: -# 1) Covered cstring symbols are reordered -# 2) the rest of the cstring symbols remain original relative order within the cstring section +# 1) Covered cstring symbols to be reordered +# 2) the rest of the cstring symbols to remain in the original relative order within the cstring section # ORIGIN_SYM: _local_foo1 # ORIGIN_SYM: _globl_foo2 @@ -58,8 +60,8 @@ CSTR;1496286555 #foo3 CSTR;1343999025 -# ONE_SYM: _globl_foo2 -# ONE_SYM: _local_foo2 +# ONE_SYM-DAG: _globl_foo2 +# ONE_SYM-DAG: _local_foo2 # ONE_SYM: _bar # ONE_SYM: _bar2 # ONE_SYM: _globl_foo3 diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt index 28a8af8f06319..2ebcf5a8e7aca 100644 --- a/lldb/bindings/python/CMakeLists.txt +++ b/lldb/bindings/python/CMakeLists.txt @@ -60,8 +60,10 @@ endfunction() function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_target_dir) # Add a Post-Build Event to copy over Python files and create the symlink to # liblldb.so for the Python API(hardlink on Windows). + # Note that Swig-generated code is located one level deeper in the `native` + # module, in order to avoid cyclic imports. add_custom_target(${swig_target} ALL VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir}/native/ DEPENDS ${lldb_python_bindings_dir}/lldb.py COMMENT "Python script sym-linking LLDB Python API") @@ -75,6 +77,8 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" "${lldb_python_target_dir}") + create_python_package(${swig_target} ${lldb_python_target_dir} "native" FILES) + # Distribute the examples as python packages. create_python_package( ${swig_target} @@ -143,7 +147,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar endif() set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb${LLDB_PYTHON_EXT_SUFFIX}") create_relative_symlink(${swig_target} ${LIBLLDB_SYMLINK_DEST} - ${lldb_python_target_dir} ${LIBLLDB_SYMLINK_OUTPUT_FILE}) + ${lldb_python_target_dir}/native/ ${LIBLLDB_SYMLINK_OUTPUT_FILE}) if (NOT WIN32) diff --git a/lldb/bindings/python/python.swig b/lldb/bindings/python/python.swig index b2823f98acac8..3d2caa65f1658 100644 --- a/lldb/bindings/python/python.swig +++ b/lldb/bindings/python/python.swig @@ -50,7 +50,12 @@ Older swig versions will simply ignore this setting. import $module except ImportError: # Relative import should work if we are being loaded by Python. - from . import $module" + # The cpython module built by swig is pushed one level down into + # the native submodule, because at this point the interpreter + # is still constructing the lldb module itself. + # Simply importing anything using `from . import` constitutes + # a cyclic import. + from .native import $module" %enddef // The name of the module to be created.
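The swig fragment above expands `$module` to the generated extension module, which for LLDB is `_lldb`. Combined with the new `native` package created in the CMake change, the generated import logic behaves roughly like the following sketch (a simplified illustration of the pattern, not the verbatim generated code):

    try:
        # Succeeds when the extension module is already known to the
        # interpreter, e.g. when this code runs embedded inside lldb itself.
        import _lldb
    except ImportError:
        # When `lldb` is imported as an ordinary Python package, the package
        # is still being constructed at this point, so `from . import _lldb`
        # would be a cyclic import; the swig-built extension instead lives
        # one level down, in the `lldb.native` submodule.
        from .native import _lldb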
diff --git a/lldb/include/lldb/API/SBFile.h b/lldb/include/lldb/API/SBFile.h index ebdc5607b7942..8cf4fe1b405fa 100644 --- a/lldb/include/lldb/API/SBFile.h +++ b/lldb/include/lldb/API/SBFile.h @@ -27,7 +27,10 @@ class LLDB_API SBFile { SBFile(FileSP file_sp); #ifndef SWIG SBFile(const SBFile &rhs); + LLDB_DEPRECATED_FIXME("Use the constructor that specifies mode instead", + "SBFile(FILE*, const char*, bool)") SBFile(FILE *file, bool transfer_ownership); + SBFile(FILE *file, const char *mode, bool transfer_ownership); #endif SBFile(int fd, const char *mode, bool transfer_ownership); ~SBFile(); diff --git a/lldb/include/lldb/Host/File.h b/lldb/include/lldb/Host/File.h index 7402a2231735a..590c9fa523b29 100644 --- a/lldb/include/lldb/Host/File.h +++ b/lldb/include/lldb/Host/File.h @@ -66,6 +66,9 @@ class File : public IOObject { LLVM_MARK_AS_BITMASK_ENUM(/* largest_value= */ eOpenOptionInvalid) }; + static constexpr OpenOptions OpenOptionsModeMask = + eOpenOptionReadOnly | eOpenOptionWriteOnly | eOpenOptionReadWrite; + static mode_t ConvertOpenOptionsForPOSIXOpen(OpenOptions open_options); static llvm::Expected<OpenOptions> GetOptionsFromMode(llvm::StringRef mode); static bool DescriptorIsValid(int descriptor) { return descriptor >= 0; }; @@ -384,7 +387,7 @@ class NativeFile : public File { NativeFile(); - NativeFile(FILE *fh, bool transfer_ownership); + NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership); NativeFile(int fd, OpenOptions options, bool transfer_ownership); diff --git a/lldb/include/lldb/Host/StreamFile.h b/lldb/include/lldb/Host/StreamFile.h index e37661a9938c0..8b01eeab6f586 100644 --- a/lldb/include/lldb/Host/StreamFile.h +++ b/lldb/include/lldb/Host/StreamFile.h @@ -81,7 +81,8 @@ class LockableStreamFile { LockableStreamFile(StreamFile &stream_file, Mutex &mutex) : m_file_sp(stream_file.GetFileSP()), m_mutex(mutex) {} LockableStreamFile(FILE *fh, bool transfer_ownership, Mutex &mutex) - : m_file_sp(std::make_shared<NativeFile>(fh, transfer_ownership)), + : m_file_sp(std::make_shared<NativeFile>(fh, File::eOpenOptionWriteOnly, + transfer_ownership)), m_mutex(mutex) {} LockableStreamFile(std::shared_ptr<File> file_sp, Mutex &mutex) : m_file_sp(file_sp), m_mutex(mutex) {} diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 0122fe8409c29..55dbd3934860f 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -339,9 +339,11 @@ endif # library to make ASAN tests work for most users, including the bots.
ifeq "$(OS)" "Darwin" ifneq "$(ASAN_OPTIONS)" "" -LDFLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +ASAN_LDFLAGS = -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib endif endif +LDFLAGS += $(ASAN_LDFLAGS) + OBJECTS = EXE ?= a.out diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index ac550962cfb85..f85ab1910a2eb 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -191,6 +191,11 @@ class NotSupportedError(KeyError): class DebugCommunication(object): + @property + def is_stopped(self) -> bool: + """Returns True if the debuggee is stopped, otherwise False.""" + return len(self.thread_stop_reasons) > 0 or self.exit_status is not None + def __init__( self, recv: BinaryIO, @@ -860,7 +865,17 @@ def request_configurationDone(self): response = self._send_recv(command_dict) if response: self.configuration_done_sent = True + stopped_on_entry = self.is_stopped self.request_threads() + if not stopped_on_entry: + # Drop the initial cached threads if we did not stop-on-entry. + # In VSCode, immediately following 'configurationDone', a + # 'threads' request is made to get the initial set of threads, + # specifically the main threads id and name. + # We issue the threads request to mimic this pattern but in our + # tests we don't want to cache the result unless the process is + # actually stopped. + self.threads = None return response def _process_stopped(self): @@ -978,9 +993,10 @@ def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None return [] args_dict = { "expression": expression, - "context": context, "frameId": stackFrame["id"], } + if context: + args_dict["context"] = context command_dict = { "command": "evaluate", "type": "request", diff --git a/lldb/source/API/SBCommandReturnObject.cpp b/lldb/source/API/SBCommandReturnObject.cpp index e78e213aa23af..da7e288e38d28 100644 --- a/lldb/source/API/SBCommandReturnObject.cpp +++ b/lldb/source/API/SBCommandReturnObject.cpp @@ -15,6 +15,7 @@ #include "lldb/API/SBValue.h" #include "lldb/API/SBValueList.h" #include "lldb/Core/StructuredDataImpl.h" +#include "lldb/Host/File.h" #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/Instrumentation.h" @@ -275,14 +276,16 @@ void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh) { void SBCommandReturnObject::SetImmediateOutputFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateOutputFile(file); } void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateErrorFile(file); } diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 5c4c653d95a81..7a4bebfdf998e 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -327,8 +327,8 @@ void SBDebugger::SkipAppInitFiles(bool b) { void SBDebugger::SetInputFileHandle(FILE *fh, bool transfer_ownership) { 
LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); if (m_opaque_sp) - m_opaque_sp->SetInputFile( - (FileSP)std::make_shared<NativeFile>(fh, transfer_ownership)); + m_opaque_sp->SetInputFile((FileSP)std::make_shared<NativeFile>( + fh, File::eOpenOptionReadOnly, transfer_ownership)); } SBError SBDebugger::SetInputString(const char *data) { @@ -385,7 +385,8 @@ SBError SBDebugger::SetOutputFile(FileSP file_sp) { void SBDebugger::SetOutputFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetOutputFile((FileSP)std::make_shared<NativeFile>(fh, transfer_ownership)); + SetOutputFile((FileSP)std::make_shared<NativeFile>( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError SBDebugger::SetOutputFile(SBFile file) { @@ -405,7 +406,8 @@ SBError SBDebugger::SetOutputFile(SBFile file) { void SBDebugger::SetErrorFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetErrorFile((FileSP)std::make_shared<NativeFile>(fh, transfer_ownership)); + SetErrorFile((FileSP)std::make_shared<NativeFile>( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError SBDebugger::SetErrorFile(FileSP file_sp) { @@ -576,8 +578,10 @@ void SBDebugger::HandleProcessEvent(const SBProcess &process, FILE *err) { LLDB_INSTRUMENT_VA(this, process, event, out, err); - FileSP outfile = std::make_shared<NativeFile>(out, false); - FileSP errfile = std::make_shared<NativeFile>(err, false); + FileSP outfile = + std::make_shared<NativeFile>(out, File::eOpenOptionWriteOnly, false); + FileSP errfile = + std::make_shared<NativeFile>(err, File::eOpenOptionWriteOnly, false); return HandleProcessEvent(process, event, outfile, errfile); } diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp index 2ae4b1481afbf..56909923d4b2d 100644 --- a/lldb/source/API/SBFile.cpp +++ b/lldb/source/API/SBFile.cpp @@ -39,7 +39,22 @@ SBFile::SBFile() { LLDB_INSTRUMENT_VA(this); } SBFile::SBFile(FILE *file, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, file, transfer_ownership); - m_opaque_sp = std::make_shared<NativeFile>(file, transfer_ownership); + // For backwards compatibility, this defaults to ReadOnly, as it did previously.
+ m_opaque_sp = std::make_shared(file, File::eOpenOptionReadOnly, + transfer_ownership); +} + +SBFile::SBFile(FILE *file, const char *mode, bool transfer_ownership) { + LLDB_INSTRUMENT_VA(this, file, transfer_ownership); + + auto options = File::GetOptionsFromMode(mode); + if (!options) { + llvm::consumeError(options.takeError()); + return; + } + + m_opaque_sp = + std::make_shared(file, options.get(), transfer_ownership); } SBFile::SBFile(int fd, const char *mode, bool transfer_owndership) { diff --git a/lldb/source/API/SBInstruction.cpp b/lldb/source/API/SBInstruction.cpp index 6755089af39a4..5921511f3b239 100644 --- a/lldb/source/API/SBInstruction.cpp +++ b/lldb/source/API/SBInstruction.cpp @@ -10,8 +10,8 @@ #include "lldb/Utility/Instrumentation.h" #include "lldb/API/SBAddress.h" -#include "lldb/API/SBFrame.h" #include "lldb/API/SBFile.h" +#include "lldb/API/SBFrame.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBTarget.h" @@ -268,7 +268,8 @@ bool SBInstruction::GetDescription(lldb::SBStream &s) { void SBInstruction::Print(FILE *outp) { LLDB_INSTRUMENT_VA(this, outp); - FileSP out = std::make_shared(outp, /*take_ownership=*/false); + FileSP out = std::make_shared(outp, File::eOpenOptionWriteOnly, + /*take_ownership=*/false); Print(out); } diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index d4be64b815369..14aa9432eed83 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBProcess.h" +#include "lldb/Host/File.h" #include "lldb/Utility/Instrumentation.h" #include @@ -310,7 +311,8 @@ void SBProcess::ReportEventState(const SBEvent &event, SBFile out) const { void SBProcess::ReportEventState(const SBEvent &event, FILE *out) const { LLDB_INSTRUMENT_VA(this, event, out); - FileSP outfile = std::make_shared(out, false); + FileSP outfile = + std::make_shared(out, File::eOpenOptionWriteOnly, false); return ReportEventState(event, outfile); } diff --git a/lldb/source/API/SBStream.cpp b/lldb/source/API/SBStream.cpp index fc8f09a7bb9ae..2fc5fcfa8b0c4 100644 --- a/lldb/source/API/SBStream.cpp +++ b/lldb/source/API/SBStream.cpp @@ -116,7 +116,8 @@ void SBStream::RedirectToFile(const char *path, bool append) { void SBStream::RedirectToFileHandle(FILE *fh, bool transfer_fh_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_fh_ownership); - FileSP file = std::make_unique(fh, transfer_fh_ownership); + FileSP file = std::make_unique(fh, File::eOpenOptionReadWrite, + transfer_fh_ownership); return RedirectToFile(file); } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index b37d9d3ed85e3..02f38e9094ec5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -965,7 +965,8 @@ llvm::StringRef Debugger::GetStaticBroadcasterClass() { Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) : UserID(g_unique_id++), Properties(std::make_shared()), - m_input_file_sp(std::make_shared(stdin, NativeFile::Unowned)), + m_input_file_sp(std::make_shared( + stdin, File::eOpenOptionReadOnly, NativeFile::Unowned)), m_output_stream_sp(std::make_shared( stdout, NativeFile::Unowned, m_output_mutex)), m_error_stream_sp(std::make_shared( @@ -1172,7 +1173,8 @@ Status Debugger::SetInputString(const char *data) { return result; } - SetInputFile((FileSP)std::make_shared(commands_file, true)); + SetInputFile((FileSP)std::make_shared( + commands_file, File::eOpenOptionReadOnly, true)); return 
result; } @@ -1378,7 +1380,8 @@ void Debugger::AdoptTopIOHandlerFilesIfInvalid(FileSP &in, in = GetInputFileSP(); // If there is nothing, use stdin if (!in) - in = std::make_shared(stdin, NativeFile::Unowned); + in = std::make_shared(stdin, File::eOpenOptionReadOnly, + NativeFile::Unowned); } // If no STDOUT has been set, then set it appropriately if (!out || !out->GetUnlockedFile().IsValid()) { diff --git a/lldb/source/Host/common/File.cpp b/lldb/source/Host/common/File.cpp index 65b75bd647c5d..4fad93fca9ea3 100644 --- a/lldb/source/Host/common/File.cpp +++ b/lldb/source/Host/common/File.cpp @@ -249,8 +249,8 @@ uint32_t File::GetPermissions(Status &error) const { NativeFile::NativeFile() = default; -NativeFile::NativeFile(FILE *fh, bool transfer_ownership) - : m_stream(fh), m_own_stream(transfer_ownership) { +NativeFile::NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership) + : m_stream(fh), m_options(options), m_own_stream(transfer_ownership) { #ifdef _WIN32 // In order to properly display non ASCII characters in Windows, we need to // use Windows APIs to print to the console. This is only required if the @@ -258,6 +258,26 @@ NativeFile::NativeFile(FILE *fh, bool transfer_ownership) int fd = _fileno(fh); is_windows_console = ::GetFileType((HANDLE)::_get_osfhandle(fd)) == FILE_TYPE_CHAR; +#else +#ifndef NDEBUG + int fd = fileno(fh); + if (fd != -1) { + int required_mode = ConvertOpenOptionsForPOSIXOpen(options) & O_ACCMODE; + int mode = fcntl(fd, F_GETFL); + if (mode != -1) { + mode &= O_ACCMODE; + // Check that the file is open with a valid subset of the requested file + // access mode, e.g. if we expected the file to be writable then ensure it + // was opened with O_WRONLY or O_RDWR. + assert( + (required_mode == O_RDWR && mode == O_RDWR) || + (required_mode == O_RDONLY && (mode == O_RDWR || mode == O_RDONLY) || + (required_mode == O_WRONLY && + (mode == O_RDWR || mode == O_WRONLY))) && + "invalid file access mode"); + } + } +#endif #endif } @@ -274,7 +294,8 @@ NativeFile::NativeFile(int fd, OpenOptions options, bool transfer_ownership) } bool NativeFile::IsValid() const { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); return DescriptorIsValidUnlocked() || StreamIsValidUnlocked(); } @@ -343,7 +364,8 @@ FILE *NativeFile::GetStream() { } Status NativeFile::Close() { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); Status error; @@ -548,6 +570,10 @@ Status NativeFile::Sync() { Status NativeFile::Read(void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for reading. + if ((m_options & File::OpenOptionsModeMask) == eOpenOptionWriteOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_READ_SIZE) if (num_bytes > MAX_READ_SIZE) { uint8_t *p = (uint8_t *)buf; @@ -612,6 +638,10 @@ Status NativeFile::Read(void *buf, size_t &num_bytes) { Status NativeFile::Write(const void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for writing. 
+ if ((m_options & File::OpenOptionsModeMask) == File::eOpenOptionReadOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_WRITE_SIZE) if (num_bytes > MAX_WRITE_SIZE) { const uint8_t *p = (const uint8_t *)buf; @@ -776,8 +806,8 @@ Status NativeFile::Write(const void *buf, size_t &num_bytes, off_t &offset) { int fd = GetDescriptor(); if (fd != kInvalidDescriptor) { #ifndef _WIN32 - ssize_t bytes_written = - llvm::sys::RetryAfterSignal(-1, ::pwrite, m_descriptor, buf, num_bytes, offset); + ssize_t bytes_written = llvm::sys::RetryAfterSignal( + -1, ::pwrite, m_descriptor, buf, num_bytes, offset); if (bytes_written < 0) { num_bytes = 0; error = Status::FromErrno(); diff --git a/lldb/source/Host/common/StreamFile.cpp b/lldb/source/Host/common/StreamFile.cpp index 099980a0993c6..131412d81983b 100644 --- a/lldb/source/Host/common/StreamFile.cpp +++ b/lldb/source/Host/common/StreamFile.cpp @@ -27,7 +27,8 @@ StreamFile::StreamFile(int fd, bool transfer_ownership) : Stream() { } StreamFile::StreamFile(FILE *fh, bool transfer_ownership) : Stream() { - m_file_sp = std::make_shared(fh, transfer_ownership); + m_file_sp = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); } StreamFile::StreamFile(const char *path, File::OpenOptions options, diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 3493fa9fef635..35a772c1454df 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -272,6 +272,7 @@ void ScriptInterpreterPython::SharedLibraryDirectoryHelper( // does. if (this_file.GetFileNameExtension() == ".pyd") { this_file.RemoveLastPathComponent(); // _lldb.pyd or _lldb_d.pyd + this_file.RemoveLastPathComponent(); // native this_file.RemoveLastPathComponent(); // lldb llvm::StringRef libdir = LLDB_PYTHON_RELATIVE_LIBDIR; for (auto it = llvm::sys::path::begin(libdir), diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp index 6080703998ff2..9964ae492bc00 100644 --- a/lldb/source/Symbol/Symtab.cpp +++ b/lldb/source/Symbol/Symtab.cpp @@ -722,15 +722,11 @@ Symtab::AppendSymbolIndexesWithNameAndType(ConstString symbol_name, std::vector &indexes) { std::lock_guard guard(m_mutex); - if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return indexes.size(); } @@ -742,15 +738,11 @@ uint32_t Symtab::AppendSymbolIndexesWithNameAndType( std::lock_guard guard(m_mutex); if (AppendSymbolIndexesWithName(symbol_name, symbol_debug_type, - symbol_visibility, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + symbol_visibility, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return 
indexes.size(); } diff --git a/lldb/test/API/commands/target/auto-install-main-executable/Makefile b/lldb/test/API/commands/target/auto-install-main-executable/Makefile index 07e6c9a1d0f15..d0578fb699d1b 100644 --- a/lldb/test/API/commands/target/auto-install-main-executable/Makefile +++ b/lldb/test/API/commands/target/auto-install-main-executable/Makefile @@ -6,4 +6,4 @@ a.out: a.device.out include Makefile.rules a.device.out: - $(CXX) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp + $(CXX) $(ASAN_LDFLAGS) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp diff --git a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile index 12781fd847768..f13584041fb51 100644 --- a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile +++ b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile @@ -5,7 +5,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c mkdir com.apple.sbd.xpc mv com.apple.sbd com.apple.sbd.xpc/ mkdir -p com.apple.sbd.xpc.dSYM/Contents/Resources/DWARF @@ -13,7 +13,7 @@ $(EXE): rm -rf com.apple.sbd.dSYM mkdir hide.app tar cf - com.apple.sbd.xpc com.apple.sbd.xpc.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c clean:: rm -rf a.out a.out.dSYM hide.app com.apple.sbd com.apple.sbd.dSYM com.apple.sbd.xpc com.apple.sbd.xpc.dSYM find-bundle-with-dots-in-fn find-bundle-with-dots-in-fn.dSYM diff --git a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile index 806c840c9f2ee..c041d9e7a0e95 100644 --- a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile +++ b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile @@ -4,7 +4,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c mkdir -p MyFramework.framework/Versions/A/Headers mkdir -p MyFramework.framework/Versions/A/Resources cp MyFramework MyFramework.framework/Versions/A @@ -18,7 +18,7 @@ $(EXE): mkdir hide.app rm -f MyFramework tar cf - MyFramework.framework MyFramework.framework.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. -framework MyFramework + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. 
-framework MyFramework clean:: rm -rf a.out a.out.dSYM deep-bundle deep-bundle.dSYM MyFramework.framework MyFramework.framework.dSYM MyFramework MyFramework.dSYM hide.app diff --git a/lldb/test/API/macosx/posix_spawn/Makefile b/lldb/test/API/macosx/posix_spawn/Makefile index 7ae46ca95828d..cbdee9122e3f2 100644 --- a/lldb/test/API/macosx/posix_spawn/Makefile +++ b/lldb/test/API/macosx/posix_spawn/Makefile @@ -6,13 +6,13 @@ include Makefile.rules all: fat.out x86_64.out: x86_64.c - $(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< x86_64h.out: x86_64h.c - $(CC) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< arm64.out: arm64.c - $(CC) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< fat.out: x86_64.out x86_64h.out arm64.out $(LIPO) -o fat.out -create $^ diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py index 20a75f4076e42..3c233a5b43ebb 100644 --- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py +++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py @@ -1,5 +1,5 @@ """ -Test lldb-dap completions request +Test lldb-dap evaluate request """ import re @@ -7,16 +7,67 @@ import lldbdap_testcase from lldbsuite.test.decorators import skipIfWindows from lldbsuite.test.lldbtest import line_number +from typing import TypedDict, Optional + + +class EvaluateResponseBody(TypedDict, total=False): + result: str + variablesReference: int + type: Optional[str] + memoryReference: Optional[str] + valueLocationReference: Optional[int] class TestDAP_evaluate(lldbdap_testcase.DAPTestCaseBase): - def assertEvaluate(self, expression, regex): + def assertEvaluate( + self, + expression, + result: str, + want_type="", + want_varref=False, + want_memref=True, + want_locref=False, + ): + resp = self.dap_server.request_evaluate(expression, context=self.context) + self.assertTrue( + resp["success"], f"Failed to evaluate expression {expression!r}" + ) + body: EvaluateResponseBody = resp["body"] self.assertRegex( - self.dap_server.request_evaluate(expression, context=self.context)["body"][ - "result" - ], - regex, + body["result"], + result, + f"Unexpected 'result' for expression {expression!r} in response body {body}", ) + if want_varref: + self.assertNotEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + else: + self.assertEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + if want_type: + self.assertEqual( + body["type"], + want_type, + f"Unexpected 'type' for expression {expression!r} in response body {body}", + ) + if want_memref: + self.assertIn( + "memoryReference", + body, + f"Unexpected 'memoryReference' for expression {expression!r} in response body {body}", + ) + if want_locref: + self.assertIn( + "valueLocationReference", + body, + f"Unexpected 'valueLocationReference' for expression {expression!r} in response body {body}", + ) def assertEvaluateFailure(self, expression): self.assertNotIn( @@ -71,29 +122,39 @@ def run_test_evaluate_expressions( self.continue_to_breakpoint(breakpoint_1) # 
Expressions at breakpoint 1, which is in main - self.assertEvaluate("var1", "20") + self.assertEvaluate("var1", "20", want_type="int") # Empty expression should equate to the previous expression. if context == "repl": self.assertEvaluate("", "20") else: self.assertEvaluateFailure("") - self.assertEvaluate("var2", "21") + self.assertEvaluate("var2", "21", want_type="int") if context == "repl": - self.assertEvaluate("", "21") - self.assertEvaluate("", "21") - self.assertEvaluate("static_int", "42") - self.assertEvaluate("non_static_int", "43") - self.assertEvaluate("struct1.foo", "15") - self.assertEvaluate("struct2->foo", "16") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("static_int", "42", want_type="int") + self.assertEvaluate("non_static_int", "43", want_type="int") + self.assertEvaluate("struct1.foo", "15", want_type="int") + self.assertEvaluate("struct2->foo", "16", want_type="int") if self.isResultExpandedDescription(): self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, + ) + self.assertEvaluate( + "struct2", + r"\(my_struct \*\) (struct2|\$\d+) = 0x.*", + want_type="my_struct *", + want_varref=True, ) - self.assertEvaluate("struct2", r"\(my_struct \*\) (struct2|\$\d+) = 0x.*") self.assertEvaluate( - "struct3", r"\(my_struct \*\) (struct3|\$\d+) = nullptr" + "struct3", + r"\(my_struct \*\) (struct3|\$\d+) = nullptr", + want_type="my_struct *", + want_varref=True, ) else: self.assertEvaluate( @@ -103,16 +164,22 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_varref=True, + ) + self.assertEvaluate( + "struct2", + "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*", + want_varref=True, + want_type="my_struct *", ) self.assertEvaluate( - "struct2", "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*" + "struct3", "0x.*0", want_varref=True, want_type="my_struct *" ) - self.assertEvaluate("struct3", "0x.*0") if context == "repl": # In the repl context expressions may be interpreted as lldb # commands since no variables have the same name as the command. 
- self.assertEvaluate("list", r".*") + self.assertEvaluate("list", r".*", want_memref=False) else: self.assertEvaluateFailure("list") # local variable of a_function @@ -121,10 +188,26 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("foo") # member of my_struct if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "36") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate( + "a_function(1)", "1", want_memref=False, want_type="int" + ) + self.assertEvaluate("var2 + struct1.foo", "36", want_memref=False) + self.assertEvaluate( + "foo_func", + "0x.*a.out`foo_func.*", + want_type="int (*)()", + want_varref=True, + want_memref=False, + want_locref=True, + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -145,6 +228,8 @@ def run_test_evaluate_expressions( self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, ) else: self.assertEvaluate( @@ -154,15 +239,26 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_type="my_struct", + want_varref=True, ) self.assertEvaluate("struct1.foo", "15") self.assertEvaluate("struct2->foo", "16") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "17") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("var2 + struct1.foo", "17", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -185,10 +281,18 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("var2 + struct1.foo") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("list + 1", "43") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("list + 1", "43", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -199,26 +303,28 @@ def run_test_evaluate_expressions( # Now we check that values are updated after stepping self.continue_to_breakpoint(breakpoint_4) - self.assertEvaluate("my_vec", "size=2") + self.assertEvaluate("my_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_5) - self.assertEvaluate("my_vec", "size=3") + self.assertEvaluate("my_vec", "size=3", want_varref=True) - self.assertEvaluate("my_map", "size=2") + self.assertEvaluate("my_map", "size=2", want_varref=True) 
self.continue_to_breakpoint(breakpoint_6) - self.assertEvaluate("my_map", "size=3") + self.assertEvaluate("my_map", "size=3", want_varref=True) - self.assertEvaluate("my_bool_vec", "size=1") + self.assertEvaluate("my_bool_vec", "size=1", want_varref=True) self.continue_to_breakpoint(breakpoint_7) - self.assertEvaluate("my_bool_vec", "size=2") + self.assertEvaluate("my_bool_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_8) # Test memory read, especially with 'empty' repeat commands. if context == "repl": - self.assertEvaluate("memory read -c 1 &my_ints", ".* 05 .*\n") - self.assertEvaluate("", ".* 0a .*\n") - self.assertEvaluate("", ".* 0f .*\n") - self.assertEvaluate("", ".* 14 .*\n") - self.assertEvaluate("", ".* 19 .*\n") + self.assertEvaluate( + "memory read -c 1 &my_ints", ".* 05 .*\n", want_memref=False + ) + self.assertEvaluate("", ".* 0a .*\n", want_memref=False) + self.assertEvaluate("", ".* 0f .*\n", want_memref=False) + self.assertEvaluate("", ".* 14 .*\n", want_memref=False) + self.assertEvaluate("", ".* 19 .*\n", want_memref=False) self.continue_to_exit() @@ -245,4 +351,6 @@ def test_hover_evaluate_expressions(self): @skipIfWindows def test_variable_evaluate_expressions(self): # Tests expression evaluations that are triggered in the variable explorer - self.run_test_evaluate_expressions("variable", enableAutoVariableSummaries=True) + self.run_test_evaluate_expressions( + "variables", enableAutoVariableSummaries=True + ) diff --git a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp index e1556846dff19..ea8c3a2a4a296 100644 --- a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp @@ -10,148 +10,31 @@ #include "EventHelper.h" #include "JSONUtils.h" #include "LLDBUtils.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" +#include "lldb/lldb-enumerations.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace lldb_dap; +using namespace lldb_dap::protocol; namespace lldb_dap { -// "EvaluateRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Evaluate request; value of command field is 'evaluate'. -// Evaluates the given expression in the context of the -// top most stack frame. The expression has access to any -// variables and arguments that are in scope.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "evaluate" ] -// }, -// "arguments": { -// "$ref": "#/definitions/EvaluateArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "EvaluateArguments": { -// "type": "object", -// "description": "Arguments for 'evaluate' request.", -// "properties": { -// "expression": { -// "type": "string", -// "description": "The expression to evaluate." -// }, -// "frameId": { -// "type": "integer", -// "description": "Evaluate the expression in the scope of this stack -// frame. If not specified, the expression is evaluated -// in the global scope." -// }, -// "context": { -// "type": "string", -// "_enum": [ "watch", "repl", "hover" ], -// "enumDescriptions": [ -// "evaluate is run in a watch.", -// "evaluate is run from REPL console.", -// "evaluate is run from a data hover." -// ], -// "description": "The context in which the evaluate request is run." 
-// }, -// "format": { -// "$ref": "#/definitions/ValueFormat", -// "description": "Specifies details on how to format the Evaluate -// result." -// } -// }, -// "required": [ "expression" ] -// }, -// "EvaluateResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'evaluate' request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "result": { -// "type": "string", -// "description": "The result of the evaluate request." -// }, -// "type": { -// "type": "string", -// "description": "The optional type of the evaluate result." -// }, -// "presentationHint": { -// "$ref": "#/definitions/VariablePresentationHint", -// "description": "Properties of a evaluate result that can be -// used to determine how to render the result in -// the UI." -// }, -// "variablesReference": { -// "type": "number", -// "description": "If variablesReference is > 0, the evaluate -// result is structured and its children can be -// retrieved by passing variablesReference to the -// VariablesRequest." -// }, -// "namedVariables": { -// "type": "number", -// "description": "The number of named child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "indexedVariables": { -// "type": "number", -// "description": "The number of indexed child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "valueLocationReference": { -// "type": "integer", -// "description": "A reference that allows the client to request -// the location where the returned value is -// declared. For example, if a function pointer is -// returned, the adapter may be able to look up the -// function's location. This should be present only -// if the adapter is likely to be able to resolve -// the location.\n\nThis reference shares the same -// lifetime as the `variablesReference`. See -// 'Lifetime of Object References' in the -// Overview section for details." -// } -// "memoryReference": { -// "type": "string", -// "description": "A memory reference to a location appropriate -// for this result. For pointer type eval -// results, this is generally a reference to the -// memory address contained in the pointer. This -// attribute may be returned by a debug adapter -// if corresponding capability -// `supportsMemoryReferences` is true." -// }, -// }, -// "required": [ "result", "variablesReference" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void EvaluateRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - llvm::json::Object body; - const auto *arguments = request.getObject("arguments"); - lldb::SBFrame frame = dap.GetLLDBFrame(*arguments); - std::string expression = - GetString(arguments, "expression").value_or("").str(); - const llvm::StringRef context = GetString(arguments, "context").value_or(""); +/// Evaluates the given expression in the context of a stack frame. +/// +/// The expression has access to any variables and arguments that are in scope. 
+Expected +EvaluateRequestHandler::Run(const EvaluateArguments &arguments) const { + EvaluateResponseBody body; + lldb::SBFrame frame = dap.GetLLDBFrame(arguments.frameId); + std::string expression = arguments.expression; bool repeat_last_command = expression.empty() && dap.last_nonempty_var_expression.empty(); - if (context == "repl" && + if (arguments.context == protocol::eEvaluateContextRepl && (repeat_last_command || (!expression.empty() && dap.DetectReplMode(frame, expression, false) == ReplMode::Command))) { @@ -165,70 +48,60 @@ void EvaluateRequestHandler::operator()( } bool required_command_failed = false; - std::string result = RunLLDBCommands( + body.result = RunLLDBCommands( dap.debugger, llvm::StringRef(), {expression}, required_command_failed, /*parse_command_directives=*/false, /*echo_commands=*/false); + return body; + } - EmplaceSafeString(body, "result", result); - body.try_emplace("variablesReference", (int64_t)0); - } else { - if (context == "repl") { - // If the expression is empty and the last expression was for a - // variable, set the expression to the previous expression (repeat the - // evaluation); otherwise save the current non-empty expression for the - // next (possibly empty) variable expression. - if (expression.empty()) - expression = dap.last_nonempty_var_expression; - else - dap.last_nonempty_var_expression = expression; - } - // Always try to get the answer from the local variables if possible. If - // this fails, then if the context is not "hover", actually evaluate an - // expression using the expression parser. - // - // "frame variable" is more reliable than the expression parser in - // many cases and it is faster. - lldb::SBValue value = frame.GetValueForVariablePath( - expression.data(), lldb::eDynamicDontRunTarget); - - // Freeze dry the value in case users expand it later in the debug console - if (value.GetError().Success() && context == "repl") - value = value.Persist(); - - if (value.GetError().Fail() && context != "hover") - value = frame.EvaluateExpression(expression.data()); - - if (value.GetError().Fail()) { - response["success"] = llvm::json::Value(false); - // This error object must live until we're done with the pointer returned - // by GetCString(). 
- lldb::SBError error = value.GetError(); - const char *error_cstr = error.GetCString(); - if (error_cstr && error_cstr[0]) - EmplaceSafeString(response, "message", error_cstr); - else - EmplaceSafeString(response, "message", "evaluate failed"); - } else { - VariableDescription desc(value, - dap.configuration.enableAutoVariableSummaries); - EmplaceSafeString(body, "result", desc.GetResult(context)); - EmplaceSafeString(body, "type", desc.display_type_name); - int64_t var_ref = 0; - if (value.MightHaveChildren() || ValuePointsToCode(value)) - var_ref = dap.variables.InsertVariable( - value, /*is_permanent=*/context == "repl"); - if (value.MightHaveChildren()) - body.try_emplace("variablesReference", var_ref); - else - body.try_emplace("variablesReference", (int64_t)0); - if (lldb::addr_t addr = value.GetLoadAddress(); - addr != LLDB_INVALID_ADDRESS) - body.try_emplace("memoryReference", EncodeMemoryReference(addr)); - if (ValuePointsToCode(value)) - body.try_emplace("valueLocationReference", var_ref); - } + if (arguments.context == eEvaluateContextRepl) { + // If the expression is empty and the last expression was for a + // variable, set the expression to the previous expression (repeat the + // evaluation); otherwise save the current non-empty expression for the + // next (possibly empty) variable expression. + if (expression.empty()) + expression = dap.last_nonempty_var_expression; + else + dap.last_nonempty_var_expression = expression; } - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + + // Always try to get the answer from the local variables if possible. If + // this fails, then if the context is not "hover", actually evaluate an + // expression using the expression parser. + // + // "frame variable" is more reliable than the expression parser in + // many cases and it is faster. 
+ lldb::SBValue value = frame.GetValueForVariablePath( + expression.data(), lldb::eDynamicDontRunTarget); + + // Freeze dry the value in case users expand it later in the debug console + if (value.GetError().Success() && arguments.context == eEvaluateContextRepl) + value = value.Persist(); + + if (value.GetError().Fail() && arguments.context != eEvaluateContextHover) + value = frame.EvaluateExpression(expression.data()); + + if (value.GetError().Fail()) + return ToError(value.GetError(), /*show_user=*/false); + + VariableDescription desc(value, + dap.configuration.enableAutoVariableSummaries); + + body.result = desc.GetResult(arguments.context); + body.type = desc.display_type_name; + + if (value.MightHaveChildren() || ValuePointsToCode(value)) + body.variablesReference = dap.variables.InsertVariable( + value, /*is_permanent=*/arguments.context == eEvaluateContextRepl); + + if (lldb::addr_t addr = value.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) + body.memoryReference = EncodeMemoryReference(addr); + + if (ValuePointsToCode(value) && + body.variablesReference != LLDB_DAP_INVALID_VARRERF) + body.valueLocationReference = PackLocation(body.variablesReference, true); + + return body; } + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index bc22133d92453..65a52075ebd79 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -292,11 +292,14 @@ class DisconnectRequestHandler Run(const std::optional &args) const override; }; -class EvaluateRequestHandler : public LegacyRequestHandler { +class EvaluateRequestHandler + : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "evaluate"; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected + Run(const protocol::EvaluateArguments &) const override; FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureEvaluateForHovers}; } diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 1a3a6701b194d..81eadae03bb48 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -11,6 +11,7 @@ #include "ExceptionBreakpoint.h" #include "LLDBUtils.h" #include "Protocol/ProtocolBase.h" +#include "Protocol/ProtocolRequests.h" #include "ProtocolUtils.h" #include "lldb/API/SBAddress.h" #include "lldb/API/SBCompileUnit.h" @@ -817,10 +818,10 @@ VariableDescription::VariableDescription(lldb::SBValue v, evaluate_name = llvm::StringRef(evaluateStream.GetData()).str(); } -std::string VariableDescription::GetResult(llvm::StringRef context) { +std::string VariableDescription::GetResult(protocol::EvaluateContext context) { // In repl context, the results can be displayed as multiple lines so more // detailed descriptions can be returned. 
- if (context != "repl") + if (context != protocol::eEvaluateContextRepl) return display_value; if (!v.IsValid()) diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 0c865a33a6ce4..329dc8ab02f99 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -10,7 +10,7 @@ #define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H #include "DAPForward.h" -#include "Protocol/ProtocolTypes.h" +#include "Protocol/ProtocolRequests.h" #include "lldb/API/SBCompileUnit.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBType.h" @@ -28,7 +28,7 @@ namespace lldb_dap { -/// Emplace a StringRef in a json::Object after enusring that the +/// Emplace a StringRef in a json::Object after ensuring that the /// string is valid UTF8. If not, first call llvm::json::fixUTF8 /// before emplacing. /// @@ -351,7 +351,7 @@ struct VariableDescription { std::optional custom_name = {}); /// Returns a description of the value appropriate for the specified context. - std::string GetResult(llvm::StringRef context); + std::string GetResult(protocol::EvaluateContext context); }; /// Does the given variable have an associated value location? diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 4db6caa1af38b..e2ba2ee64103d 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "LLDBUtils.h" +#include "DAPError.h" #include "JSONUtils.h" #include "lldb/API/SBCommandInterpreter.h" #include "lldb/API/SBCommandReturnObject.h" @@ -17,6 +18,7 @@ #include "lldb/API/SBThread.h" #include "lldb/lldb-enumerations.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" @@ -214,13 +216,14 @@ GetStopDisassemblyDisplay(lldb::SBDebugger &debugger) { return result; } -llvm::Error ToError(const lldb::SBError &error) { +llvm::Error ToError(const lldb::SBError &error, bool show_user) { if (error.Success()) return llvm::Error::success(); - return llvm::createStringError( - std::error_code(error.GetError(), std::generic_category()), - error.GetCString()); + return llvm::make_error( + /*message=*/error.GetCString(), + /*EC=*/std::error_code(error.GetError(), std::generic_category()), + /*show_user=*/show_user); } std::string GetStringValue(const lldb::SBStructuredData &data) { diff --git a/lldb/tools/lldb-dap/LLDBUtils.h b/lldb/tools/lldb-dap/LLDBUtils.h index 9db721a47ccf7..a29d3d88789a0 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.h +++ b/lldb/tools/lldb-dap/LLDBUtils.h @@ -243,7 +243,7 @@ class ScopeSyncMode { lldb::StopDisassemblyType GetStopDisassemblyDisplay(lldb::SBDebugger &debugger); /// Take ownership of the stored error. -llvm::Error ToError(const lldb::SBError &error); +llvm::Error ToError(const lldb::SBError &error, bool show_user = true); /// Provides the string value if this data structure is a string type. 
std::string GetStringValue(const lldb::SBStructuredData &data); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 44ae79f8b9f43..ac01cfb95dd41 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -8,6 +8,7 @@ #include "Protocol/ProtocolRequests.h" #include "JSONUtils.h" +#include "Protocol/ProtocolTypes.h" #include "lldb/lldb-defines.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" @@ -639,6 +640,54 @@ json::Value toJSON(const ExceptionInfoResponseBody &ERB) { result.insert({"description", ERB.description}); if (ERB.details.has_value()) result.insert({"details", *ERB.details}); + return result; +} + +static bool fromJSON(const llvm::json::Value &Params, EvaluateContext &C, + llvm::json::Path P) { + auto rawContext = Params.getAsString(); + if (!rawContext) { + P.report("expected a string"); + return false; + } + C = StringSwitch(*rawContext) + .Case("watch", EvaluateContext::eEvaluateContextWatch) + .Case("repl", EvaluateContext::eEvaluateContextRepl) + .Case("hover", EvaluateContext::eEvaluateContextHover) + .Case("clipboard", EvaluateContext::eEvaluateContextClipboard) + .Case("variables", EvaluateContext::eEvaluateContextVariables) + .Default(eEvaluateContextUnknown); + return true; +} + +bool fromJSON(const llvm::json::Value &Params, EvaluateArguments &Args, + llvm::json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("expression", Args.expression) && + O.mapOptional("frameId", Args.frameId) && + O.mapOptional("line", Args.line) && + O.mapOptional("column", Args.column) && + O.mapOptional("source", Args.source) && + O.mapOptional("context", Args.context) && + O.mapOptional("format", Args.format); +} + +llvm::json::Value toJSON(const EvaluateResponseBody &Body) { + json::Object result{{"result", Body.result}, + {"variablesReference", Body.variablesReference}}; + + if (!Body.type.empty()) + result.insert({"type", Body.type}); + if (Body.presentationHint) + result.insert({"presentationHint", Body.presentationHint}); + if (Body.namedVariables) + result.insert({"namedVariables", Body.namedVariables}); + if (Body.indexedVariables) + result.insert({"indexedVariables", Body.indexedVariables}); + if (!Body.memoryReference.empty()) + result.insert({"memoryReference", Body.memoryReference}); + if (Body.valueLocationReference != LLDB_DAP_INVALID_VALUE_LOC) + result.insert({"valueLocationReference", Body.valueLocationReference}); return result; } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index b894f2b4ed44d..c1e1e93f1e44a 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1061,6 +1061,123 @@ struct ExceptionInfoResponseBody { }; llvm::json::Value toJSON(const ExceptionInfoResponseBody &); +/// The context in which the evaluate request is used. +enum EvaluateContext : unsigned { + /// An unspecified or unknown evaluate context. + eEvaluateContextUnknown = 0, + /// 'watch': evaluate is called from a watch view context. + eEvaluateContextWatch = 1, + /// 'repl': evaluate is called from a REPL context. + eEvaluateContextRepl = 2, + /// 'hover': evaluate is called to generate the debug hover contents. + /// This value should only be used if the corresponding capability + /// `supportsEvaluateForHovers` is true. 
+ eEvaluateContextHover = 3, + /// 'clipboard': evaluate is called to generate clipboard contents. + /// This value should only be used if the corresponding capability + /// `supportsClipboardContext` is true. + eEvaluateContextClipboard = 4, + /// 'variables': evaluate is called from a variables view context. + eEvaluateContextVariables = 5, +}; + +/// Arguments for `evaluate` request. +struct EvaluateArguments { + /// The expression to evaluate. + std::string expression; + + /// Evaluate the expression in the scope of this stack frame. If not + /// specified, the expression is evaluated in the global scope. + uint64_t frameId = LLDB_DAP_INVALID_FRAME_ID; + + /// The contextual line where the expression should be evaluated. In the + /// 'hover' context, this should be set to the start of the expression being + /// hovered. + uint32_t line = LLDB_INVALID_LINE_NUMBER; + + /// The contextual column where the expression should be evaluated. This may + /// be provided if `line` is also provided. + /// + /// It is measured in UTF-16 code units and the client capability + /// `columnsStartAt1` determines whether it is 0- or 1-based. + uint32_t column = LLDB_INVALID_COLUMN_NUMBER; + + /// The contextual source in which the `line` is found. This must be provided + /// if `line` is provided. + std::optional source; + + /// The context in which the evaluate request is used. + /// Values: + /// 'watch': evaluate is called from a watch view context. + /// 'repl': evaluate is called from a REPL context. + /// 'hover': evaluate is called to generate the debug hover contents. + /// This value should only be used if the corresponding capability + /// `supportsEvaluateForHovers` is true. + /// 'clipboard': evaluate is called to generate clipboard contents. + /// This value should only be used if the corresponding capability + /// `supportsClipboardContext` is true. + /// 'variables': evaluate is called from a variables view context. + /// etc. + EvaluateContext context = eEvaluateContextUnknown; + + /// Specifies details on how to format the result. + /// The attribute is only honored by a debug adapter if the corresponding + /// capability `supportsValueFormattingOptions` is true. + std::optional format; +}; +bool fromJSON(const llvm::json::Value &, EvaluateArguments &, llvm::json::Path); + +/// Response to 'evaluate' request. +struct EvaluateResponseBody { + /// The result of the evaluate request. + std::string result; + + /// The type of the evaluate result. + /// This attribute should only be returned by a debug adapter if the + /// corresponding capability `supportsVariableType` is true. + std::string type; + + /// Properties of an evaluate result that can be used to determine how to + /// render the result in the UI. + std::optional presentationHint; + + /// If `variablesReference` is > 0, the evaluate result is structured and its + /// children can be retrieved by passing `variablesReference` to the + /// `variables` request as long as execution remains suspended. See 'Lifetime + /// of Object References' in the Overview section for details. + int64_t variablesReference = 0; + + /// The number of named child variables. + /// The client can use this information to present the variables in a paged + /// UI and fetch them in chunks. + /// The value should be less than or equal to 2147483647 (2^31-1). + uint32_t namedVariables = 0; + + /// The number of indexed child variables. + /// The client can use this information to present the variables in a paged + /// UI and fetch them in chunks. 
+ /// The value should be less than or equal to 2147483647 (2^31-1). + uint32_t indexedVariables = 0; + + /// A memory reference to a location appropriate for this result. + /// For pointer type eval results, this is generally a reference to the + /// memory address contained in the pointer. + /// This attribute may be returned by a debug adapter if corresponding + /// capability `supportsMemoryReferences` is true. + std::string memoryReference; + + /// A reference that allows the client to request the location where the + /// returned value is declared. For example, if a function pointer is + /// returned, the adapter may be able to look up the function's location. + /// This should be present only if the adapter is likely to be able to + /// resolve the location. + /// + /// This reference shares the same lifetime as the `variablesReference`. See + /// 'Lifetime of Object References' in the Overview section for details. + uint64_t valueLocationReference = LLDB_DAP_INVALID_VALUE_LOC; +}; +llvm::json::Value toJSON(const EvaluateResponseBody &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 6d85c74377bd3..690a1d684d0e9 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -28,8 +28,9 @@ #include #include -#define LLDB_DAP_INVALID_VARRERF UINT64_MAX +#define LLDB_DAP_INVALID_VARRERF INT64_MAX #define LLDB_DAP_INVALID_SRC_REF 0 +#define LLDB_DAP_INVALID_VALUE_LOC 0 namespace lldb_dap::protocol { diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp index 498195dc09325..ba9aef1e5fcc5 100644 --- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp +++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp @@ -67,3 +67,54 @@ TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); } + +TEST(ProtocolRequestsTest, EvaluateArguments) { + llvm::Expected expected = parse(R"({ + "expression": "hello world", + "context": "repl" + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->expression, "hello world"); + EXPECT_EQ(expected->context, eEvaluateContextRepl); + + // Check required keys; + EXPECT_THAT_EXPECTED(parse(R"({})"), + FailedWithMessage("missing value at (root).expression")); +} + +TEST(ProtocolRequestsTest, EvaluateResponseBody) { + EvaluateResponseBody body; + body.result = "hello world"; + body.variablesReference = 7; + + // Check required keys. + Expected expected = parse(R"({ + "result": "hello world", + "variablesReference": 7 + })"); + + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); + + // Check optional keys. 
+ body.result = "'abc'"; + body.type = "string"; + body.variablesReference = 42; + body.namedVariables = 1; + body.indexedVariables = 2; + body.memoryReference = "0x123"; + body.valueLocationReference = 22; + + Expected expected_opt = parse(R"({ + "result": "'abc'", + "type": "string", + "variablesReference": 42, + "namedVariables": 1, + "indexedVariables": 2, + "memoryReference": "0x123", + "valueLocationReference": 22 + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); +} diff --git a/lldb/unittests/Host/FileTest.cpp b/lldb/unittests/Host/FileTest.cpp index d973d19430596..85697c49f6fce 100644 --- a/lldb/unittests/Host/FileTest.cpp +++ b/lldb/unittests/Host/FileTest.cpp @@ -8,6 +8,7 @@ #include "lldb/Host/File.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Path.h" @@ -35,7 +36,7 @@ TEST(File, GetWaitableHandleFileno) { FILE *stream = fdopen(fd, "r"); ASSERT_TRUE(stream); - NativeFile file(stream, true); + NativeFile file(stream, File::eOpenOptionReadWrite, true); #ifdef _WIN32 EXPECT_EQ(file.GetWaitableHandle(), (HANDLE)_get_osfhandle(fd)); #else @@ -67,3 +68,22 @@ TEST(File, GetStreamFromDescriptor) { EXPECT_EQ(file.GetWaitableHandle(), (file_t)fd); #endif } + +TEST(File, ReadOnlyModeNotWritable) { + const auto *Info = testing::UnitTest::GetInstance()->current_test_info(); + llvm::SmallString<128> name; + int fd; + llvm::sys::fs::createTemporaryFile(llvm::Twine(Info->test_case_name()) + "-" + + Info->name(), + "test", fd, name); + + llvm::FileRemover remover(name); + ASSERT_GE(fd, 0); + + NativeFile file(fd, File::eOpenOptionReadOnly, true); + ASSERT_TRUE(file.IsValid()); + llvm::StringLiteral buf = "Hello World"; + size_t bytes_written = buf.size(); + Status error = file.Write(buf.data(), bytes_written); + EXPECT_EQ(error.Fail(), true); +} diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 6a4610397967a..9bbc75ff7700c 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -450,13 +450,16 @@ if( LLVM_ENABLE_PIC ) # Enable interprocedural optimizations for non-inline functions which would # otherwise be disabled due to GCC -fPIC's default. # Note: GCC<10.3 has a bug on SystemZ. - # + # Note: Default on AIX is "no semantic interposition". # Note: Clang allows IPO for -fPIC so this optimization is less effective. # Clang 13 has a bug related to -fsanitize-coverage # -fno-semantic-interposition (https://reviews.llvm.org/D117183). 
- if ((CMAKE_COMPILER_IS_GNUCXX AND - NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3)) - OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14)) + if ((NOT ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX")) + AND ((CMAKE_COMPILER_IS_GNUCXX AND + NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3)) + OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" + AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14))) add_flag_if_supported("-fno-semantic-interposition" FNO_SEMANTIC_INTERPOSITION) endif() endif() diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 511cb56f73dcb..557dbf8c7ca39 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -903,6 +903,11 @@ template inline BinaryOpc_match m_Srl(const LHS &L, const RHS &R) { return BinaryOpc_match(ISD::SRL, L, R); } +template +inline auto m_ExactSr(const LHS &L, const RHS &R) { + return m_AnyOf(BinaryOpc_match(ISD::SRA, L, R, SDNodeFlags::Exact), + BinaryOpc_match(ISD::SRL, L, R, SDNodeFlags::Exact)); +} template inline BinaryOpc_match m_Rotl(const LHS &L, const RHS &R) { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 7c97afd5a7f5a..59848c582f6d5 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -201,7 +201,7 @@ enum class OMPDynGroupprivateFallbackType : uint64_t { }; // Default OpenMP mapper name suffix. -inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper"; +inline constexpr const char *OmpDefaultMapperName = "_omp_default_mapper"; /// Values for bit flags used to specify the mapping type for /// offloading. diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 9577d0141f168..c91fc254ebe11 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -30,6 +30,8 @@ class LLVMContext; class Module; class AttributeList; class AttributeSet; +class raw_ostream; +class Constant; /// This namespace contains an enum with a value for every intrinsic/builtin /// function known by LLVM. The enum values are returned by @@ -81,6 +83,9 @@ namespace Intrinsic { /// Returns true if the intrinsic can be overloaded. LLVM_ABI bool isOverloaded(ID id); + /// Returns true if the intrinsic has pretty printed immediate arguments. + LLVM_ABI bool hasPrettyPrintedArgs(ID id); + /// isTargetIntrinsic - Returns true if IID is an intrinsic specific to a /// certain target. If it is a generic intrinsic false is returned. LLVM_ABI bool isTargetIntrinsic(ID IID); @@ -284,6 +289,10 @@ namespace Intrinsic { /// N. LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor); + /// Print the argument info for the arguments with ArgInfo. + LLVM_ABI void printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, + const Constant *ImmArgVal); + } // namespace Intrinsic } // namespace llvm diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 9413c3a4d5b32..adec819432534 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -142,6 +142,25 @@ class Range : IntrinsicProperty { int Upper = upper; } +// ArgProperty - Base class for argument properties that can be specified in ArgInfo. +class ArgProperty; + +// ArgName - Specifies the name of an argument for pretty-printing. 
+class ArgName : ArgProperty { + string Name = name; +} + +// ImmArgPrinter - Specifies a custom printer function for immediate arguments. +class ImmArgPrinter : ArgProperty { + string FuncName = funcname; +} + +// ArgInfo - The specified argument has properties defined by a list of ArgProperty objects. +class ArgInfo arg_properties> : IntrinsicProperty { + int ArgNo = idx.Value; + list Properties = arg_properties; +} + def IntrNoReturn : IntrinsicProperty; // Applied by default. diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 21badc2692037..1b485dc8ccd1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2955,7 +2955,14 @@ foreach sp = [0, 1] in { defvar nargs = !size(args); defvar scale_d_imm = ArgIndex; defvar scale_d_imm_range = [ImmArg, Range]; - defvar intrinsic_properties = !listconcat( + + // Check if this is the specific llvm.nvvm.tcgen05.mma.tensor intrinsic. + defvar is_target_intrinsic = !and(!eq(sp, 0), + !eq(space, "tensor"), + !eq(scale_d, 0), + !eq(ashift, 0)); + + defvar base_properties = !listconcat( mma.common_intr_props, !if(!eq(scale_d, 1), scale_d_imm_range, []), [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, // kind @@ -2965,6 +2972,13 @@ foreach sp = [0, 1] in { ] ); + defvar intrinsic_properties = !if(is_target_intrinsic, + !listconcat(base_properties, + [ArgInfo, [ArgName<"kind">, ImmArgPrinter<"printTcgen05MMAKind">]>, + ArgInfo, [ArgName<"cta_group">]>, + ArgInfo, [ArgName<"collector">, ImmArgPrinter<"printTcgen05CollectorUsageOp">]>]), + base_properties); + def mma.record_name: DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, mma.intr_name>; diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index d55100e5e709d..d383769043605 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -18,8 +18,11 @@ #include #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { namespace nvvm { @@ -659,6 +662,51 @@ inline APFloat::roundingMode GetFMARoundingMode(Intrinsic::ID IntrinsicID) { llvm_unreachable("Invalid FP instrinsic rounding mode for NVVM fma"); } +inline void printTcgen05MMAKind(raw_ostream &OS, const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05MMAKind::F16: + OS << "f16"; + return; + case Tcgen05MMAKind::TF32: + OS << "tf32"; + return; + case Tcgen05MMAKind::F8F6F4: + OS << "f8f6f4"; + return; + case Tcgen05MMAKind::I8: + OS << "i8"; + return; + } + } + llvm_unreachable( + "printTcgen05MMAKind called with invalid value for immediate argument"); +} + +inline void printTcgen05CollectorUsageOp(raw_ostream &OS, + const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05CollectorUsageOp::DISCARD: + OS << "discard"; + return; + case Tcgen05CollectorUsageOp::LASTUSE: + OS << "lastuse"; + return; + case Tcgen05CollectorUsageOp::FILL: + OS << "fill"; + return; + case Tcgen05CollectorUsageOp::USE: + OS << "use"; + return; + } + } + llvm_unreachable("printTcgen05CollectorUsageOp called with invalid value for " + "immediate argument"); +} + } // namespace nvvm } // namespace llvm #endif // 
LLVM_IR_NVVMINTRINSICUTILS_H diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index 6e36e580358e7..f4897b6a406fb 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -438,7 +438,7 @@ class LLVM_ABI MCRegisterInfo { /// number. Returns -1 if there is no equivalent value. The second /// parameter allows targets to use different numberings for EH info and /// debugging info. - virtual int64_t getDwarfRegNum(MCRegister RegNum, bool isEH) const; + virtual int64_t getDwarfRegNum(MCRegister Reg, bool isEH) const; /// Map a dwarf register back to a target register. Returns std::nullopt if /// there is no mapping. @@ -450,11 +450,11 @@ class LLVM_ABI MCRegisterInfo { /// Map a target register to an equivalent SEH register /// number. Returns LLVM register number if there is no equivalent value. - int getSEHRegNum(MCRegister RegNum) const; + int getSEHRegNum(MCRegister Reg) const; /// Map a target register to an equivalent CodeView register /// number. - int getCodeViewRegNum(MCRegister RegNum) const; + int getCodeViewRegNum(MCRegister Reg) const; regclass_iterator regclass_begin() const { return Classes; } regclass_iterator regclass_end() const { return Classes+NumClasses; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 530fa9518f40e..a3e9b039f9225 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -317,9 +317,9 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { if (InsertElementInst *III = dyn_cast(V)) { // If this is an insert to a variable element, we don't know what it is. - if (!isa(III->getOperand(2))) + uint64_t IIElt; + if (!match(III->getOperand(2), m_ConstantInt(IIElt))) return nullptr; - unsigned IIElt = cast(III->getOperand(2))->getZExtValue(); // If this is an insert to the element we are looking for, return the // inserted value. diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 32b6c46303828..34531dd7ab17f 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -133,10 +133,6 @@ INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass", // Common ML Advisor declarations // =================================== namespace { -// The model can only accept a specified number of opcodes and will error it if -// fed an opcode it hasn't seen before. This constant sets the current cutoff. -static const int OpcodeValueCutoff = 17716; - // Most features are as described above, so we'll reuse this vector in defining // them. static const std::vector PerLiveRangeShape{1, NumberOfInterferences}; @@ -948,139 +944,6 @@ void MLEvictAdvisor::extractFeatures( #undef SET } -void llvm::extractInstructionFeatures( - SmallVectorImpl &LRPosInfo, MLModelRunner *RegallocRunner, - function_ref GetOpcode, - function_ref GetMBBFreq, - function_ref GetMBBReference, - const int InstructionsIndex, const int InstructionsMappingIndex, - const int MBBFreqIndex, const int MBBMappingIndex, - const SlotIndex LastIndex) { - // This function extracts instruction based features relevant to the eviction - // problem currently being solved. This function ends up extracting two - // tensors. - // 1 - A vector of size max instruction count. It contains the opcodes of the - // instructions spanned by all the intervals in the current instance of the - // eviction problem. 
- // 2 - A binary mapping matrix of size (LR count * max - // instruction count) which maps where the LRs are live to the actual opcodes - // for which they are live. - // 3 - A vector of size max supported MBB count storing MBB frequencies, - // encompassing all of the MBBs covered by the eviction problem. - // 4 - A vector of size max instruction count of indices to members of the MBB - // frequency vector, mapping each instruction to its associated MBB. - - // Start off by sorting the segments based on the beginning slot index. - std::sort( - LRPosInfo.begin(), LRPosInfo.end(), - [](LRStartEndInfo A, LRStartEndInfo B) { return A.Begin < B.Begin; }); - size_t InstructionIndex = 0; - size_t CurrentSegmentIndex = 0; - SlotIndex CurrentIndex = LRPosInfo[0].Begin; - std::map VisitedMBBs; - size_t CurrentMBBIndex = 0; - // This loop processes all the segments sequentially by starting at the - // beginning slot index of the first segment, iterating through all the slot - // indices before the end slot index of that segment (while checking for - // overlaps with segments that start at greater slot indices). After hitting - // that end index, the current segment being processed gets bumped until they - // are all processed or the max instruction count is hit, where everything is - // just truncated. - while (true) { - // If the index that we are currently at is within the current segment and - // we haven't hit the max instruction count, continue processing the current - // segment. - while (CurrentIndex <= LRPosInfo[CurrentSegmentIndex].End && - InstructionIndex < ModelMaxSupportedInstructionCount) { - int CurrentOpcode = GetOpcode(CurrentIndex); - // If the current machine instruction is null, skip it - if (CurrentOpcode == -1) { - // If we're currently at the last index in the SlotIndex analysis, - // we can't go any further, so return from the function - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - continue; - } - MachineBasicBlock *CurrentMBBReference = GetMBBReference(CurrentIndex); - if (VisitedMBBs.count(CurrentMBBReference) == 0) { - VisitedMBBs[CurrentMBBReference] = CurrentMBBIndex; - ++CurrentMBBIndex; - } - extractMBBFrequency(CurrentIndex, InstructionIndex, VisitedMBBs, - GetMBBFreq, CurrentMBBReference, RegallocRunner, - MBBFreqIndex, MBBMappingIndex); - // Current code assumes we're not going to get any disjointed segments - assert(LRPosInfo[CurrentSegmentIndex].Begin <= CurrentIndex); - RegallocRunner->getTensor(InstructionsIndex)[InstructionIndex] = - CurrentOpcode < OpcodeValueCutoff ? CurrentOpcode : 0; - // set value in the binary mapping matrix for the current instruction - auto CurrentSegmentPosition = LRPosInfo[CurrentSegmentIndex].Pos; - RegallocRunner->getTensor( - InstructionsMappingIndex)[CurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - // All of the segments are sorted based on the beginning slot index, but - // this doesn't mean that the beginning slot index of the next segment is - // after the end segment of the one being currently processed. This while - // loop checks for overlapping segments and modifies the portion of the - // column in the mapping matrix for the currently processed instruction - // for the LR it is checking. Also make sure that the beginning of the - // current segment we're checking for overlap in is less than the current - // index, otherwise we're done checking overlaps. 
- size_t OverlapCheckCurrentSegment = CurrentSegmentIndex + 1; - while (OverlapCheckCurrentSegment < LRPosInfo.size() && - LRPosInfo[OverlapCheckCurrentSegment].Begin <= CurrentIndex) { - auto OverlapCurrentSegmentPosition = - LRPosInfo[OverlapCheckCurrentSegment].Pos; - if (LRPosInfo[OverlapCheckCurrentSegment].End >= CurrentIndex) { - RegallocRunner->getTensor( - InstructionsMappingIndex)[OverlapCurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - } - ++OverlapCheckCurrentSegment; - } - ++InstructionIndex; - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - } - // if we've just finished processing through the last segment or if we've - // hit the maximum number of instructions, break out of the loop. - if (CurrentSegmentIndex == LRPosInfo.size() - 1 || - InstructionIndex >= ModelMaxSupportedInstructionCount) { - break; - } - // If the segments are not overlapping, we need to move to the beginning - // index of the next segment to avoid having instructions not attached to - // any register. - if (LRPosInfo[CurrentSegmentIndex + 1].Begin > - LRPosInfo[CurrentSegmentIndex].End) { - CurrentIndex = LRPosInfo[CurrentSegmentIndex + 1].Begin; - } - ++CurrentSegmentIndex; - } -} - -void llvm::extractMBBFrequency( - const SlotIndex CurrentIndex, const size_t CurrentInstructionIndex, - std::map &VisitedMBBs, - function_ref GetMBBFreq, - MachineBasicBlock *CurrentMBBReference, MLModelRunner *RegallocRunner, - const int MBBFreqIndex, const int MBBMappingIndex) { - size_t CurrentMBBIndex = VisitedMBBs[CurrentMBBReference]; - float CurrentMBBFreq = GetMBBFreq(CurrentIndex); - if (CurrentMBBIndex < ModelMaxSupportedMBBCount) { - RegallocRunner->getTensor(MBBFreqIndex)[CurrentMBBIndex] = - CurrentMBBFreq; - RegallocRunner->getTensor( - MBBMappingIndex)[CurrentInstructionIndex] = CurrentMBBIndex; - } -} - // Development mode-specific implementations #ifdef LLVM_HAVE_TFLITE diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ea69d47b234e2..b9c724156631e 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -53,6 +53,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -4675,12 +4676,38 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + bool HasPrettyPrintedArgs = + isa(CI) && + Intrinsic::hasPrettyPrintedArgs(CI->getIntrinsicID()); + ListSeparator LS; - for (unsigned op = 0, Eop = CI->arg_size(); op < Eop; ++op) { - Out << LS; - writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op)); + Function *CalledFunc = CI->getCalledFunction(); + auto PrintArgComment = [&](unsigned ArgNo) { + const auto *ConstArg = dyn_cast(CI->getArgOperand(ArgNo)); + if (!ConstArg) + return; + std::string ArgComment; + raw_string_ostream ArgCommentStream(ArgComment); + Intrinsic::ID IID = CalledFunc->getIntrinsicID(); + Intrinsic::printImmArg(IID, ArgNo, ArgCommentStream, ConstArg); + if (ArgComment.empty()) + return; + Out << "/* " << ArgComment << " */ "; + }; + if (HasPrettyPrintedArgs) { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < NumArgs; + ++ArgNo) { + Out << LS; + PrintArgComment(ArgNo); + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } + } else { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < 
NumArgs; + ++ArgNo) { + Out << LS; + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } } - // Emit an ellipsis if this is a musttail call in a vararg function. This // is only to aid readability, musttail calls forward varargs by default. if (CI->isMustTailCall() && CI->getParent() && @@ -5104,12 +5131,10 @@ void AssemblyWriter::printUseLists(const Function *F) { //===----------------------------------------------------------------------===// void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, - bool ShouldPreserveUseListOrder, - bool IsForDebug) const { + bool ShouldPreserveUseListOrder, bool IsForDebug) const { SlotTracker SlotTable(this->getParent()); formatted_raw_ostream OS(ROS); - AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, - IsForDebug, + AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, IsForDebug, ShouldPreserveUseListOrder); W.printFunction(this); } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 526800e217399..859689b9cf168 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/IntrinsicsXCore.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/IR/Type.h" using namespace llvm; @@ -601,6 +602,12 @@ bool Intrinsic::isOverloaded(ID id) { #undef GET_INTRINSIC_OVERLOAD_TABLE } +bool Intrinsic::hasPrettyPrintedArgs(ID id){ +#define GET_INTRINSIC_PRETTY_PRINT_TABLE +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_TABLE +} + /// Table of per-target intrinsic name tables. #define GET_INTRINSIC_TARGET_DATA #include "llvm/IR/IntrinsicImpl.inc" @@ -1142,3 +1149,7 @@ Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); return InterleaveIntrinsics[Factor - 2].Deinterleave; } + +#define GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS diff --git a/llvm/lib/MC/MCInst.cpp b/llvm/lib/MC/MCInst.cpp index 46a6a18e15963..61eeb5e5a5c71 100644 --- a/llvm/lib/MC/MCInst.cpp +++ b/llvm/lib/MC/MCInst.cpp @@ -29,7 +29,7 @@ void MCOperand::print(raw_ostream &OS, const MCContext *Ctx) const { if (Ctx && Ctx->getRegisterInfo()) OS << Ctx->getRegisterInfo()->getName(getReg()); else - OS << getReg(); + OS << getReg().id(); } else if (isImm()) OS << "Imm:" << getImm(); else if (isSFPImm()) diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 233176ebe2b1f..dc482210d873c 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -46,6 +46,7 @@ #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Base64.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -532,6 +533,7 @@ class AsmParser : public MCAsmParser { DK_LTO_SET_CONDITIONAL, DK_CFI_MTE_TAGGED_FRAME, DK_MEMTAG, + DK_BASE64, DK_END }; @@ -554,6 +556,7 @@ class AsmParser : public MCAsmParser { // ".ascii", ".asciz", ".string" bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); + bool parseDirectiveBase64(); // ".base64" bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc" bool parseDirectiveValue(StringRef IDVal, unsigned Size); // ".byte", ".long", ... 
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 233176ebe2b1f..dc482210d873c 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -46,6 +46,7 @@ #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Base64.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -532,6 +533,7 @@ class AsmParser : public MCAsmParser { DK_LTO_SET_CONDITIONAL, DK_CFI_MTE_TAGGED_FRAME, DK_MEMTAG, + DK_BASE64, DK_END }; @@ -554,6 +556,7 @@ class AsmParser : public MCAsmParser { // ".ascii", ".asciz", ".string" bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); + bool parseDirectiveBase64(); // ".base64" bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc" bool parseDirectiveValue(StringRef IDVal, unsigned Size); // ".byte", ".long", ... @@ -1959,6 +1962,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, case DK_ASCIZ: case DK_STRING: return parseDirectiveAscii(IDVal, true); + case DK_BASE64: + return parseDirectiveBase64(); case DK_BYTE: case DK_DC_B: return parseDirectiveValue(IDVal, 1); @@ -3088,6 +3093,37 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { return parseMany(parseOp); } +/// parseDirectiveBase64: +/// ::= .base64 "string" (, "string" )* +bool AsmParser::parseDirectiveBase64() { + auto parseOp = [&]() -> bool { + if (checkForValidSection()) + return true; + + if (getTok().isNot(AsmToken::String)) { + return true; + } + + std::vector<char> Decoded; + std::string const str = getTok().getStringContents().str(); + if (check(str.empty(), "expected nonempty string")) { + return true; + } + + llvm::Error e = decodeBase64(str, Decoded); + if (e) { + consumeError(std::move(e)); + return Error(Lexer.getLoc(), "failed to base64 decode string data"); + } + + getStreamer().emitBytes(std::string(Decoded.begin(), Decoded.end())); + Lex(); + return false; + }; + + return check(parseMany(parseOp), "expected string"); +} + /// parseDirectiveReloc /// ::= .reloc expression , identifier [ , expression ] bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { @@ -5442,6 +5478,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".asciz"] = DK_ASCIZ; DirectiveKindMap[".string"] = DK_STRING; DirectiveKindMap[".byte"] = DK_BYTE; + DirectiveKindMap[".base64"] = DK_BASE64; DirectiveKindMap[".short"] = DK_SHORT; DirectiveKindMap[".value"] = DK_VALUE; DirectiveKindMap[".2byte"] = DK_2BYTE;
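For reference, the semantics the new directive implements: each quoted operand is base64-decoded and the raw bytes are emitted, so `.base64 "SGVsbG8="` has the same effect as `.ascii "Hello"`. A minimal sketch of the same decoding step using the existing llvm::decodeBase64 helper the parser calls:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Base64.h"
    #include "llvm/Support/raw_ostream.h"
    #include <vector>

    int main() {
      std::vector<char> Decoded;
      // Malformed input yields an llvm::Error, which the directive surfaces
      // as "failed to base64 decode string data".
      if (llvm::Error E = llvm::decodeBase64("SGVsbG8=", Decoded)) {
        llvm::consumeError(std::move(E));
        return 1;
      }
      // Prints "Hello", the decoded byte sequence the streamer would emit.
      llvm::outs() << llvm::StringRef(Decoded.data(), Decoded.size()) << "\n";
      return 0;
    }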
diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index 7fd92bf974b95..77fb7332619cd 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -89,7 +89,7 @@ ArrayRef<MCPhysReg> MCRegisterInfo::getCachedAliasesOf(MCRegister R) const { return Aliases; for (MCRegAliasIteratorImpl It(R, this); It.isValid(); ++It) - Aliases.push_back(*It); + Aliases.push_back((*It).id()); sort(Aliases); Aliases.erase(unique(Aliases), Aliases.end()); @@ -141,15 +141,15 @@ unsigned MCRegisterInfo::getSubRegIndex(MCRegister Reg, return 0; } -int64_t MCRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const { +int64_t MCRegisterInfo::getDwarfRegNum(MCRegister Reg, bool isEH) const { const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs; unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize; if (!M) return -1; - DwarfLLVMRegPair Key = { RegNum, 0 }; + DwarfLLVMRegPair Key = {Reg.id(), 0}; const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key); - if (I == M+Size || I->FromReg != RegNum) + if (I == M + Size || I->FromReg != Reg) return -1; // Consumers need to be able to detect -1 and -2, but at various points // the numbers move between unsigned and signed representations, as well as @@ -191,20 +191,21 @@ int64_t MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(uint64_t RegNum) const { return RegNum; } -int MCRegisterInfo::getSEHRegNum(MCRegister RegNum) const { - const DenseMap<unsigned, int>::const_iterator I = L2SEHRegs.find(RegNum); - if (I == L2SEHRegs.end()) return (int)RegNum; +int MCRegisterInfo::getSEHRegNum(MCRegister Reg) const { + const DenseMap<unsigned, int>::const_iterator I = L2SEHRegs.find(Reg); + if (I == L2SEHRegs.end()) + return (int)Reg.id(); return I->second; } -int MCRegisterInfo::getCodeViewRegNum(MCRegister RegNum) const { +int MCRegisterInfo::getCodeViewRegNum(MCRegister Reg) const { if (L2CVRegs.empty()) report_fatal_error("target does not implement codeview register mapping"); - const DenseMap<unsigned, int>::const_iterator I = L2CVRegs.find(RegNum); + const DenseMap<unsigned, int>::const_iterator I = L2CVRegs.find(Reg); if (I == L2CVRegs.end()) - report_fatal_error("unknown codeview register " + (RegNum < getNumRegs() ? getName(RegNum) : Twine(RegNum))); + report_fatal_error("unknown codeview register " + (Reg.id() < getNumRegs() ? getName(Reg) : Twine(Reg.id()))); return I->second; } diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index b8fbfd21c4f28..b90f4e0714458 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -32,6 +32,34 @@ #endif #endif +static void RaiseLimits() { +#ifdef _AIX + // AIX has restrictive memory soft-limits out of the box, so raise them if needed. + auto RaiseLimit = [](int resource) { + struct rlimit r; + getrlimit(resource, &r); + + // Increase the soft limit to the hard limit, if necessary and + // possible. + if (r.rlim_cur != RLIM_INFINITY && r.rlim_cur != r.rlim_max) { + r.rlim_cur = r.rlim_max; + setrlimit(resource, &r); + } + }; + + // Address space size. + RaiseLimit(RLIMIT_AS); + // Heap size. + RaiseLimit(RLIMIT_DATA); + // Stack size. + RaiseLimit(RLIMIT_STACK); +#ifdef RLIMIT_RSS + // Resident set size. + RaiseLimit(RLIMIT_RSS); +#endif +#endif +} void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); @@ -67,6 +95,7 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, StackPrinter.emplace(Argc, Argv); sys::PrintStackTraceOnErrorSignal(Argv[0]); install_out_of_memory_new_handler(); + RaiseLimits(); #ifdef __MVS__ diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index c3869c3fb9a5a..3330b70cdc2e1 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -167,12 +167,11 @@ int llvm::TableGenMain(const char *argv0, // Write output to memory. Timer.startBackendTimer("Backend overall"); - SmallString<128> FilenamePrefix(OutputFilename); - sys::path::replace_extension(FilenamePrefix, ""); TableGenOutputFiles OutFiles; unsigned status = 0; // ApplyCallback will return true if it did not apply any callback. In that // case, attempt to apply the MainFn. + StringRef FilenamePrefix(sys::path::stem(OutputFilename)); if (TableGen::Emitter::ApplyCallback(Records, OutFiles, FilenamePrefix)) status = MainFn ?
MainFn(OutFiles, Records) : 1; Timer.stopBackendTimer(); @@ -195,7 +194,7 @@ int llvm::TableGenMain(const char *argv0, SmallString<128> Filename(OutputFilename); // TODO: Format using the split-file convention when writing to stdout? if (Filename != "-") { - Filename = FilenamePrefix; + sys::path::replace_extension(Filename, ""); Filename.append(Suffix); } if (int Ret = WriteOutput(Parser, argv0, Filename, Content)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 35836af3c874b..42567883b2594 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1427,12 +1427,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -6728,8 +6740,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; } +/// Helper function to check if a small vector load can be optimized. +static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD, + const AArch64Subtarget &Subtarget) { + if (!Subtarget.isNeonAvailable()) + return false; + if (LD->isVolatile()) + return false; + + EVT MemVT = LD->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16) + return false; + + Align Alignment = LD->getAlign(); + Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue()); + if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment) + return false; + + return true; +} + bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT ExtVT = ExtVal.getValueType(); + // Small, illegal vectors can be extended inreg. 
+ if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) { + if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 && + isEligibleForSmallVectorLoadOpt(Load, *Subtarget)) + return true; + } if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors()) return false; @@ -7188,12 +7226,86 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op, return Result; } +/// Helper function to optimize loads of extended small vectors. +/// These patterns would otherwise get scalarized into inefficient sequences. +static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) { + const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget)) + return SDValue(); + + EVT MemVT = Load->getMemoryVT(); + EVT ResVT = Load->getValueType(0); + unsigned NumElts = ResVT.getVectorNumElements(); + unsigned DstEltBits = ResVT.getScalarSizeInBits(); + unsigned SrcEltBits = MemVT.getScalarSizeInBits(); + + unsigned ExtOpcode; + switch (Load->getExtensionType()) { + case ISD::EXTLOAD: + case ISD::ZEXTLOAD: + ExtOpcode = ISD::ZERO_EXTEND; + break; + case ISD::SEXTLOAD: + ExtOpcode = ISD::SIGN_EXTEND; + break; + case ISD::NON_EXTLOAD: + return SDValue(); + } + + SDLoc DL(Load); + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + const MachinePointerInfo &PtrInfo = Load->getPointerInfo(); + Align Alignment = Load->getAlign(); + + // Load the data as an FP scalar to avoid issues with integer loads. + unsigned LoadBits = MemVT.getStoreSizeInBits(); + MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits); + SDValue ScalarLoad = + DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment); + + MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits); + SDValue ScalarToVec = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad); + MVT BitcastTy = + MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits); + SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec); + + SDValue Res = Bitcast; + unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits(); + unsigned CurrentNumElts = Res.getValueType().getVectorNumElements(); + while (CurrentEltBits < DstEltBits) { + if (Res.getValueSizeInBits() >= 128) { + CurrentNumElts = CurrentNumElts / 2; + MVT ExtractVT = + MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + } + CurrentEltBits = CurrentEltBits * 2; + MVT ExtVT = + MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts); + Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res); + } + + if (CurrentNumElts != NumElts) { + MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + } + + return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL); +} + SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *LoadNode = cast<LoadSDNode>(Op); assert(LoadNode && "Expected custom lowering of a load node"); + if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG)) + return Result; + if (LoadNode->getMemoryVT() == MVT::i64x8) { SmallVector<SDValue, 8> Ops; SDValue Base = LoadNode->getBasePtr(); @@ -7212,37 +7324,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, return DAG.getMergeValues({Loaded, Chain}, DL); } - // Custom lowering for extending v4i8 vector loads. - EVT VT = Op->getValueType(0); - assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); - - if (LoadNode->getMemoryVT() != MVT::v4i8) - return SDValue(); - - // Avoid generating unaligned loads. - if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) - return SDValue(); - - unsigned ExtType; - if (LoadNode->getExtensionType() == ISD::SEXTLOAD) - ExtType = ISD::SIGN_EXTEND; - else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || - LoadNode->getExtensionType() == ISD::EXTLOAD) - ExtType = ISD::ZERO_EXTEND; - else - return SDValue(); - - SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), MachinePointerInfo()); - SDValue Chain = Load.getValue(1); - SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); - SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); - SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); - Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, - DAG.getConstant(0, DL, MVT::i64)); - if (VT == MVT::v4i32) - Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); - return DAG.getMergeValues({Ext, Chain}, DL); + return SDValue(); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
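To make the widening loop concrete, here is the type progression the helper produces for one of the newly-customized combinations, a sextload of v2i64 from v2i8 (this trace is derived by stepping through the code above, not taken from the patch):

    f16 load                      // 16 bits covers both i8 lanes
    SCALAR_TO_VECTOR  -> v8f16
    BITCAST           -> v16i8
    EXTRACT_SUBVECTOR -> v8i8
    SIGN_EXTEND       -> v8i16
    EXTRACT_SUBVECTOR -> v4i16
    SIGN_EXTEND       -> v4i32
    EXTRACT_SUBVECTOR -> v2i32
    SIGN_EXTEND       -> v2i64    // matches NumElts, so no final extract

In other words, roughly a single h-register load followed by a chain of sshll-style extends, rather than the per-lane scalar loads the generic legalizer would otherwise produce.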
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 221812f1ebc7b..00fe8ee8b9b4d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1144,6 +1144,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) { return Is.size() <= 2; } +// Check if a COPY instruction is cheap. +static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) { + assert(MI.isCopy() && "Expected COPY instruction"); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64, + // typically requiring an FMOV instruction with a 2-6 cycle latency. + auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * { + if (Reg.isVirtual()) + return MRI.getRegClass(Reg); + if (Reg.isPhysical()) + return RI.getMinimalPhysRegClass(Reg); + return nullptr; + }; + const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg()); + if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC)) + return false; + + return MI.isAsCheapAsAMove(); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { @@ -1157,6 +1179,9 @@ default: return MI.isAsCheapAsAMove(); + case TargetOpcode::COPY: + return isCheapCopy(MI, RI); + case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::SUBWrs: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 29f8f9bc8b54c..8bfdbb7c5c310 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) { return AsmPrinter::doInitialization(M); } +/// Mimics GCNSubtarget::computeOccupancy for MCExpr. +/// +/// Remove dependency on GCNSubtarget and depend only on the necessary values +/// for said occupancy computation.
Should match computeOccupancy implementation +/// without passing \p STM on. +const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, + const MCExpr *NumVGPRs, + unsigned DynamicVGPRBlockSize, + const GCNSubtarget &STM, MCContext &Ctx) { + unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); + unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); + unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); + unsigned Generation = STM.getGeneration(); + + auto CreateExpr = [&Ctx](unsigned Value) { + return MCConstantExpr::create(Value, Ctx); + }; + + return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy, + {CreateExpr(MaxWaves), CreateExpr(Granule), + CreateExpr(TargetTotalNumVGPRs), + CreateExpr(Generation), CreateExpr(InitOcc), + NumSGPRs, NumVGPRs}, + Ctx); +} + void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) return; @@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { MaxWaves, MFI.getDynamicVGPRBlockSize())}); uint64_t NumSGPRsForWavesPerEU = std::max( {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); - const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( + const MCExpr *OccupancyExpr = createOccupancy( STM.getOccupancyWithWorkGroupSizes(*MF).second, MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), @@ -1270,7 +1296,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT, amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT); - ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( + ProgInfo.Occupancy = createOccupancy( STM.computeOccupancy(F, ProgInfo.LDSSize).second, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->getDynamicVGPRBlockSize(), STM, Ctx); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 90114e44f1a48..b81a08de383d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -935,7 +935,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, bool hasSALUFloat = ST->hasSALUFloatInsts(); - addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL}, Standard) .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index c27be0250e386..093c85ecabab0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -7,9 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCExpr.h" -#include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR, return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx); } -/// Mimics GCNSubtarget::computeOccupancy for MCExpr. -/// -/// Remove dependency on GCNSubtarget and depend only only the necessary values -/// for said occupancy computation. 
Should match computeOccupancy implementation -/// without passing \p STM on. -const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( - unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, - unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) { - unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); - unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize); - unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); - unsigned Generation = STM.getGeneration(); - - auto CreateExpr = [&Ctx](unsigned Value) { - return MCConstantExpr::create(Value, Ctx); - }; - - return create(AGVK_Occupancy, - {CreateExpr(MaxWaves), CreateExpr(Granule), - CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation), - CreateExpr(InitOcc), NumSGPRs, NumVGPRs}, - Ctx); -} - const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, MCContext &Ctx) { assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index 246a3f88ebce4..bf7b40b1851da 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -98,11 +98,6 @@ class AMDGPUMCExpr : public MCTargetExpr { return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); } - static const AMDGPUMCExpr * - createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, - const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, - const GCNSubtarget &STM, MCContext &Ctx); - static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, MCContext &Ctx); diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 2182039e0eef8..53d565013c4bc 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel) tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info) tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM PPCGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM PPCGenExegesis.inc -gen-exegesis) tablegen(LLVM PPCGenRegisterBank.inc -gen-register-bank) diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index ea4e597d0fd7d..ca3fe18273ff5 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -17,6 +17,7 @@ #include "PPCCallingConv.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 89165fa8f8fdb..dd537c204cec1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APInt.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f55336bafd251..220010c4d3d34 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -20,6 +20,7 @@ #include "PPCMachineFunctionInfo.h" 
#include "PPCPerfectShuffle.h" #include "PPCRegisterInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APFloat.h" @@ -1678,190 +1679,6 @@ bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore( return false; } -const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((PPCISD::NodeType)Opcode) { - case PPCISD::FIRST_NUMBER: break; - case PPCISD::FSEL: return "PPCISD::FSEL"; - case PPCISD::XSMAXC: return "PPCISD::XSMAXC"; - case PPCISD::XSMINC: return "PPCISD::XSMINC"; - case PPCISD::FCFID: return "PPCISD::FCFID"; - case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; - case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; - case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; - case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; - case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; - case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; - case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; - case PPCISD::FRE: return "PPCISD::FRE"; - case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; - case PPCISD::FTSQRT: - return "PPCISD::FTSQRT"; - case PPCISD::FSQRT: - return "PPCISD::FSQRT"; - case PPCISD::STFIWX: return "PPCISD::STFIWX"; - case PPCISD::VPERM: return "PPCISD::VPERM"; - case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXSPLTI_SP_TO_DP: - return "PPCISD::XXSPLTI_SP_TO_DP"; - case PPCISD::XXSPLTI32DX: - return "PPCISD::XXSPLTI32DX"; - case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; - case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; - case PPCISD::XXPERM: - return "PPCISD::XXPERM"; - case PPCISD::VECSHL: return "PPCISD::VECSHL"; - case PPCISD::VSRQ: - return "PPCISD::VSRQ"; - case PPCISD::CMPB: return "PPCISD::CMPB"; - case PPCISD::Hi: return "PPCISD::Hi"; - case PPCISD::Lo: return "PPCISD::Lo"; - case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; - case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; - case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; - case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; - case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA"; - case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; - case PPCISD::SRL: return "PPCISD::SRL"; - case PPCISD::SRA: return "PPCISD::SRA"; - case PPCISD::SHL: return "PPCISD::SHL"; - case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; - case PPCISD::CALL: return "PPCISD::CALL"; - case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; - case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC"; - case PPCISD::CALL_RM: - return "PPCISD::CALL_RM"; - case PPCISD::CALL_NOP_RM: - return "PPCISD::CALL_NOP_RM"; - case PPCISD::CALL_NOTOC_RM: - return "PPCISD::CALL_NOTOC_RM"; - case PPCISD::MTCTR: return "PPCISD::MTCTR"; - case PPCISD::BCTRL: return "PPCISD::BCTRL"; - case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; - case PPCISD::BCTRL_RM: - return "PPCISD::BCTRL_RM"; - case PPCISD::BCTRL_LOAD_TOC_RM: - return "PPCISD::BCTRL_LOAD_TOC_RM"; - case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE"; - case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; - case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; - case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; - case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; - case PPCISD::MFVSR: return "PPCISD::MFVSR"; - case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; - case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; - case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; - case 
PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; - case PPCISD::SCALAR_TO_VECTOR_PERMUTED: - return "PPCISD::SCALAR_TO_VECTOR_PERMUTED"; - case PPCISD::ANDI_rec_1_EQ_BIT: - return "PPCISD::ANDI_rec_1_EQ_BIT"; - case PPCISD::ANDI_rec_1_GT_BIT: - return "PPCISD::ANDI_rec_1_GT_BIT"; - case PPCISD::VCMP: return "PPCISD::VCMP"; - case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec"; - case PPCISD::LBRX: return "PPCISD::LBRX"; - case PPCISD::STBRX: return "PPCISD::STBRX"; - case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; - case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; - case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; - case PPCISD::STXSIX: return "PPCISD::STXSIX"; - case PPCISD::VEXTS: return "PPCISD::VEXTS"; - case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; - case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; - case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE"; - case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE"; - case PPCISD::ST_VSR_SCAL_INT: - return "PPCISD::ST_VSR_SCAL_INT"; - case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; - case PPCISD::BDNZ: return "PPCISD::BDNZ"; - case PPCISD::BDZ: return "PPCISD::BDZ"; - case PPCISD::MFFS: return "PPCISD::MFFS"; - case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; - case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; - case PPCISD::CR6SET: return "PPCISD::CR6SET"; - case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; - case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; - case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; - case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; - case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; - case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; - case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; - case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; - case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; - case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX"; - case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER"; - case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; - case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX"; - case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX"; - case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; - case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; - case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; - case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; - case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; - case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; - case PPCISD::PADDI_DTPREL: - return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: - return "PPCISD::VADD_SPLAT"; - case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; - case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; - case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; - case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; - case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; - case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; - case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; - case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; - case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR"; - case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR: - return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR"; - case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR: - return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR"; - case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD"; - case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD"; - case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG"; - 
case PPCISD::XXMFACC: return "PPCISD::XXMFACC"; - case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; - case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT"; - case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT"; - case PPCISD::FNMSUB: return "PPCISD::FNMSUB"; - case PPCISD::STRICT_FADDRTZ: - return "PPCISD::STRICT_FADDRTZ"; - case PPCISD::STRICT_FCTIDZ: - return "PPCISD::STRICT_FCTIDZ"; - case PPCISD::STRICT_FCTIWZ: - return "PPCISD::STRICT_FCTIWZ"; - case PPCISD::STRICT_FCTIDUZ: - return "PPCISD::STRICT_FCTIDUZ"; - case PPCISD::STRICT_FCTIWUZ: - return "PPCISD::STRICT_FCTIWUZ"; - case PPCISD::STRICT_FCFID: - return "PPCISD::STRICT_FCFID"; - case PPCISD::STRICT_FCFIDU: - return "PPCISD::STRICT_FCFIDU"; - case PPCISD::STRICT_FCFIDS: - return "PPCISD::STRICT_FCFIDS"; - case PPCISD::STRICT_FCFIDUS: - return "PPCISD::STRICT_FCFIDUS"; - case PPCISD::LXVRZX: return "PPCISD::LXVRZX"; - case PPCISD::STORE_COND: - return "PPCISD::STORE_COND"; - case PPCISD::SETBC: - return "PPCISD::SETBC"; - case PPCISD::SETBCR: - return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; - } - return nullptr; -} - EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index d967018982734..680b529b4e2e5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -34,580 +34,6 @@ namespace llvm { - namespace PPCISD { - - // When adding a NEW PPCISD node please add it to the correct position in - // the enum. The order of elements in this enum matters! - // Values that are added between FIRST_MEMORY_OPCODE and LAST_MEMORY_OPCODE - // are considered memory opcodes and are treated differently than other - // entries. - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// FSEL - Traditional three-operand fsel node. - /// - FSEL, - - /// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. - XSMAXC, - XSMINC, - - /// FCFID - The FCFID instruction, taking an f64 operand and producing - /// and f64 value containing the FP representation of the integer that - /// was temporarily in the f64 operand. - FCFID, - - /// Newer FCFID[US] integer-to-floating-point conversion instructions for - /// unsigned integers and single-precision outputs. - FCFIDU, - FCFIDS, - FCFIDUS, - - /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 - /// operand, producing an f64 value containing the integer representation - /// of that FP value. - FCTIDZ, - FCTIWZ, - - /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers with round toward zero. - FCTIDUZ, - FCTIWUZ, - - /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in - /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. - VEXTS, - - /// Reciprocal estimate instructions (unary FP ops). - FRE, - FRSQRTE, - - /// Test instruction for software square root. - FTSQRT, - - /// Square root instruction. - FSQRT, - - /// VPERM - The PPC VPERM Instruction. 
- /// - VPERM, - - /// XXSPLT - The PPC VSX splat instructions - /// - XXSPLT, - - /// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for - /// converting immediate single precision numbers to double precision - /// vector or scalar. - XXSPLTI_SP_TO_DP, - - /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. - /// - XXSPLTI32DX, - - /// VECINSERT - The PPC vector insert instruction - /// - VECINSERT, - - /// VECSHL - The PPC vector shift left instruction - /// - VECSHL, - - /// XXPERMDI - The PPC XXPERMDI instruction - /// - XXPERMDI, - XXPERM, - - /// The CMPB instruction (takes two operands of i32 or i64). - CMPB, - - /// Hi/Lo - These represent the high and low 16-bit parts of a global - /// address respectively. These nodes have two operands, the first of - /// which must be a TargetGlobalAddress, and the second of which must be a - /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', - /// though these are usually folded into other nodes. - Hi, - Lo, - - /// The following two target-specific nodes are used for calls through - /// function pointers in the 64-bit SVR4 ABI. - - /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an allocation on the stack. - DYNALLOC, - - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an offset from native SP to the address of the most recent - /// dynamic alloca. - DYNAREAOFFSET, - - /// To avoid stack clash, allocation is performed by block and each block is - /// probed. - PROBED_ALLOCA, - - /// The result of the mflr at function entry, used for PIC code. - GlobalBaseReg, - - /// These nodes represent PPC shifts. - /// - /// For scalar types, only the last `n + 1` bits of the shift amounts - /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. - /// for exact behaviors. - /// - /// For vector types, only the last n bits are used. See vsld. - SRL, - SRA, - SHL, - - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - - /// FNMSUB - Negated multiply-subtract instruction. - FNMSUB, - - /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign - /// word and shift left immediate. - EXTSWSLI, - - /// The combination of sra[wd]i and addze used to implemented signed - /// integer division by a power of 2. The first operand is the dividend, - /// and the second is the constant shift amount (representing the - /// divisor). - SRA_ADDZE, - - /// CALL - A direct function call. - /// CALL_NOP is a call with the special NOP which follows 64-bit - /// CALL_NOTOC the caller does not use the TOC. - /// SVR4 calls and 32-bit/64-bit AIX calls. - CALL, - CALL_NOP, - CALL_NOTOC, - - /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a - /// MTCTR instruction. - MTCTR, - - /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a - /// BCTRL instruction. - BCTRL, - - /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl - /// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX - /// and 64-bit AIX. - BCTRL_LOAD_TOC, - - /// The variants that implicitly define rounding mode for calls with - /// strictfp semantics. - CALL_RM, - CALL_NOP_RM, - CALL_NOTOC_RM, - BCTRL_RM, - BCTRL_LOAD_TOC_RM, - - /// Return with a glue operand, matched by 'blr' - RET_GLUE, - - /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. 
- /// This copies the bits corresponding to the specified CRREG into the - /// resultant GPR. Bits corresponding to other CR regs are undefined. - MFOCRF, - - /// Direct move from a VSX register to a GPR - MFVSR, - - /// Direct move from a GPR to a VSX register (algebraic) - MTVSRA, - - /// Direct move from a GPR to a VSX register (zero) - MTVSRZ, - - /// Direct move of 2 consecutive GPR to a VSX register. - BUILD_FP128, - - /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and - /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is - /// unsupported for this target. - /// Merge 2 GPRs to a single SPE register. - BUILD_SPE64, - - /// Extract SPE register component, second argument is high or low. - EXTRACT_SPE, - - /// Extract a subvector from signed integer vector and convert to FP. - /// It is primarily used to convert a (widened) illegal integer vector - /// type to a legal floating point vector type. - /// For example v2i32 -> widened to v4i32 -> v2f64 - SINT_VEC_TO_FP, - - /// Extract a subvector from unsigned integer vector and convert to FP. - /// As with SINT_VEC_TO_FP, used for converting illegal types. - UINT_VEC_TO_FP, - - /// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to - /// place the value into the least significant element of the most - /// significant doubleword in the vector. This is not element zero for - /// anything smaller than a doubleword on either endianness. This node has - /// the same semantics as SCALAR_TO_VECTOR except that the value remains in - /// the aforementioned location in the vector register. - SCALAR_TO_VECTOR_PERMUTED, - - // FIXME: Remove these once the ANDI glue bug is fixed: - /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the - /// eq or gt bit of CR0 after executing andi. x, 1. This is used to - /// implement truncation of i32 or i64 to i1. - ANDI_rec_1_EQ_BIT, - ANDI_rec_1_GT_BIT, - - // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit - // target (returns (Lo, Hi)). It takes a chain operand. - READ_TIME_BASE, - - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. - EH_SJLJ_SETJMP, - - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, - - /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* - /// instructions. For lack of better number, we use the opcode number - /// encoding for the OPC field to identify the compare. For example, 838 - /// is VCMPGTSH. - VCMP, - - /// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the - /// altivec VCMP*_rec instructions. For lack of better number, we use the - /// opcode number encoding for the OPC field to identify the compare. For - /// example, 838 is VCMPGTSH. - VCMP_rec, - - /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This - /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the - /// condition register to branch on, OPC is the branch opcode to use (e.g. - /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is - /// an optional input flag argument. - COND_BRANCH, - - /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based - /// loops. - BDNZ, - BDZ, - - /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding - /// towards zero. Used only as part of the long double-to-int - /// conversion sequence. - FADDRTZ, - - /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. - MFFS, - - /// TC_RETURN - A tail call return. 
- /// operand #0 chain - /// operand #1 callee (register or absolute) - /// operand #2 stack adjustment - /// operand #3 optional in flag - TC_RETURN, - - /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls - CR6SET, - CR6UNSET, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS - /// for non-position independent code on PPC32. - PPC32_GOT, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and - /// local dynamic TLS and position indendepent code on PPC32. - PPC32_PICGOT, - - /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec - /// TLS model, produces an ADDIS8 instruction that adds the GOT - /// base to sym\@got\@tprel\@ha. - ADDIS_GOT_TPREL_HA, - - /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec - /// TLS model, produces a LD instruction with base register G8RReg - /// and offset sym\@got\@tprel\@l. This completes the addition that - /// finds the offset of "sym" relative to the thread pointer. - LD_GOT_TPREL_L, - - /// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec - /// and local-exec TLS models, produces an ADD instruction that adds - /// the contents of G8RReg to the thread pointer. Symbol contains a - /// relocation sym\@tls which is to be replaced by the thread pointer - /// and identifies to the linker that the instruction is part of a - /// TLS sequence. - ADD_TLS, - - /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsgd\@ha. - ADDIS_TLSGD_HA, - - /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - ADDI_TLSGD_L, - - /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - GET_TLS_ADDR, - - /// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on - /// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread - /// pointer. At the end of the call, the thread pointer is found in R3. - GET_TPOINTER, - - /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following - /// register assignment. - ADDI_TLSGD_L_ADDR, - - /// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// Op that combines two register copies of TOC entries - /// (region handle into R3 and variable offset into R4) followed by a - /// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. - /// This node is used in 64-bit mode as well (in which case the result is - /// G8RC and inputs are X3/X4). - TLSGD_AIX, - - /// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, - /// produces a call to .__tls_get_mod(_$TLSML\@ml). - GET_TLS_MOD_AIX, - - /// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) - /// Op that requires a single input of the module handle TOC entry in R3, - /// and generates a GET_TLS_MOD_AIX node which will be expanded into a call - /// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. - /// The only difference is the register class. 
- TLSLD_AIX, - - /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsld\@ha. - ADDIS_TLSLD_HA, - - /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - ADDI_TLSLD_L, - - /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - GET_TLSLD_ADDR, - - /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion - /// following register assignment. - ADDI_TLSLD_L_ADDR, - - /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds X3 to - /// sym\@dtprel\@ha. - ADDIS_DTPREL_HA, - - /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@dtprel\@l. - ADDI_DTPREL_L, - - /// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS - /// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. - PADDI_DTPREL, - - /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded - /// during instruction selection to optimize a BUILD_VECTOR into - /// operations on splats. This is necessary to avoid losing these - /// optimizations due to constant folding. - VADD_SPLAT, - - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little - /// endian. Maps to an xxswapd instruction that corrects an lxvd2x - /// or stxvd2x instruction. The chain is necessary because the - /// sequence replaces a load and needs to provide the same number - /// of outputs. - XXSWAPD, - - /// An SDNode for swaps that are not associated with any loads/stores - /// and thereby have no chain. - SWAP_NO_CHAIN, - - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or - /// lower (IDX=1) half of v4f32 to v2f64. - FP_EXTEND_HALF, - - /// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done - /// either through an add like PADDI or through a PC Relative load like - /// PLD. - MAT_PCREL_ADDR, - - /// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for - /// TLS global address when using dynamic access models. This can be done - /// through an add like PADDI. - TLS_DYNAMIC_MAT_PCREL_ADDR, - - /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address - /// when using local exec access models, and when prefixed instructions are - /// available. This is used with ADD_TLS to produce an add like PADDI. - TLS_LOCAL_EXEC_MAT_ADDR, - - /// ACC_BUILD = Build an accumulator register from 4 VSX registers. - ACC_BUILD, - - /// PAIR_BUILD = Build a vector pair register from 2 VSX registers. - PAIR_BUILD, - - /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of - /// an accumulator or pair register. This node is needed because - /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same - /// element type. - EXTRACT_VSX_REG, - - /// XXMFACC = This corresponds to the xxmfacc instruction. 
- XXMFACC, - - // Constrained conversion from floating point to int - FIRST_STRICTFP_OPCODE, - STRICT_FCTIDZ = FIRST_STRICTFP_OPCODE, - STRICT_FCTIWZ, - STRICT_FCTIDUZ, - STRICT_FCTIWUZ, - - /// Constrained integer-to-floating-point conversion instructions. - STRICT_FCFID, - STRICT_FCFIDU, - STRICT_FCFIDS, - STRICT_FCFIDUS, - - /// Constrained floating point add in round-to-zero mode. - STRICT_FADDRTZ, - LAST_STRICTFP_OPCODE = STRICT_FADDRTZ, - - /// SETBC - The ISA 3.1 (P10) SETBC instruction. - SETBC, - - /// SETBCR - The ISA 3.1 (P10) SETBCR instruction. - SETBCR, - - /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction - VSRQ, - - // NOTE: The nodes below may require PC-Rel specific patterns if the - // address could be PC-Relative. When adding new nodes below, consider - // whether or not the address can be PC-Relative and add the corresponding - // PC-relative patterns and tests. - - /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a - /// byte-swapping store instruction. It byte-swaps the low "Type" bits of - /// the GPRC input, then stores it through Ptr. Type can be either i16 or - /// i32. - FIRST_MEMORY_OPCODE, - STBRX = FIRST_MEMORY_OPCODE, - - /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a - /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, - /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 - /// or i32. - LBRX, - - /// STFIWX - The STFIWX instruction. The first operand is an input token - /// chain, then an f64 value to store, then an address to store it to. - STFIWX, - - /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point - /// load which sign-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWAX, - - /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point - /// load which zero-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWZX, - - /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an - /// integer smaller than 64 bits into a VSR. The integer is zero-extended. - /// This can be used for converting loaded integers to floating point. - LXSIZX, - - /// STXSIX - The STXSI[bh]X instruction. The first operand is an input - /// chain, then an f64 value to store, then an address to store it to, - /// followed by a byte-width for the store. - STXSIX, - - /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to an lxvd2x instruction that will be followed by - /// an xxswapd. - LXVD2X, - - /// LXVRZX - Load VSX Vector Rightmost and Zero Extend - /// This node represents v1i128 BUILD_VECTOR of a zero extending load - /// instruction from to i128. - /// Allows utilization of the Load VSX Vector Rightmost Instructions. - LXVRZX, - - /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on - /// the vector type to load vector in big-endian element order. - LOAD_VEC_BE, - - /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a - /// v2f32 value into the lower half of a VSR register. - LD_VSX_LH, - - /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// instructions such as LXVDSX, LXVWSX. - LD_SPLAT, - - /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that zero-extends. - ZEXT_LD_SPLAT, - - /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that sign-extends. 
- SEXT_LD_SPLAT, - - /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to an stxvd2x instruction that will be preceded by - /// an xxswapd. - STXVD2X, - - /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on - /// the vector type to store vector in big-endian element order. - STORE_VEC_BE, - - /// Store scalar integers from VSR. - ST_VSR_SCAL_INT, - - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes - /// except they ensure that the compare input is zero-extended for - /// sub-word versions because the atomic loads zero-extend. - ATOMIC_CMP_SWAP_8, - ATOMIC_CMP_SWAP_16, - - /// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr - /// The store conditional instruction ST[BHWD]ARX that produces a glue - /// result to attach it to a conditional branch. - STORE_COND, - - /// GPRC = TOC_ENTRY GA, TOC - /// Loads the entry for GA from the TOC, where the TOC base is given by - /// the last operand. - TOC_ENTRY, - LAST_MEMORY_OPCODE = TOC_ENTRY, - }; - - } // end namespace PPCISD - /// Define some predicates that are used for node matching. namespace PPC { @@ -752,10 +178,6 @@ namespace llvm { explicit PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI); - /// getTargetNodeName() - This method returns the name of a target specific - /// DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - bool isSelectSupported(SelectSupportKind Kind) const override { // PowerPC does not support scalar condition selects on vectors. return (Kind != SelectSupportKind::ScalarCondVectorVal); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index f3998113ddd52..3ecc58c04e378 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -149,28 +149,49 @@ def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + +// Square root instruction. def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; + +// Test instruction for software square root. def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; +// FCFID - The FCFID instruction, taking an f64 operand and producing +// an f64 value containing the FP representation of the integer that +// was temporarily in the f64 operand. def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; + +// Newer FCFID[US] integer-to-floating-point conversion instructions for +// unsigned integers and single-precision outputs. def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>; + +// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 +// operand, producing an f64 value containing the integer representation +// of that FP value. def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; + +// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for +// unsigned integers with round toward zero. def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>;
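A note on the pattern across these PowerPC hunks (inferred from the PPCGenSDNodeInfo.inc line added to CMakeLists.txt above; the patch does not spell this out): with the hand-written PPCISD enum and getTargetNodeName deleted, the -gen-sd-node-info TableGen backend now derives the node enum and its debug names from these SDNode defs, which is why each doc comment from the old enum is preserved by moving it onto the corresponding def, and why properties such as the IsStrictFP flag used just below are now expressed directly in TableGen.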
-def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", - SDTFPRoundOp, [SDNPHasChain]>; -def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", - SDTFPRoundOp, [SDNPHasChain]>; +// Constrained integer-to-floating-point conversion instructions. +let IsStrictFP = true in { + def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", + SDTFPRoundOp, [SDNPHasChain]>; + def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", + SDTFPRoundOp, [SDNPHasChain]>; +} def PPCany_fcfid : PatFrags<(ops node:$op), [(PPCfcfid node:$op), @@ -185,28 +206,56 @@ def PPCany_fcfidus : PatFrags<(ops node:$op), [(PPCfcfidus node:$op), (PPCstrict_fcfidus node:$op)]>; +// Store scalar integers from VSR. def PPCstore_scal_int_from_vsr: SDNode<"PPCISD::ST_VSR_SCAL_INT", SDT_PPCstore_scal_int_from_vsr, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// STFIWX - The STFIWX instruction. The first operand is an input token +// chain, then an f64 value to store, then an address to store it to. def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point +// load which sign-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point +// load which zero-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an +// integer smaller than 64 bits into a VSR. The integer is zero-extended. +// This can be used for converting loaded integers to floating point. def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// STXSIX - The STXSI[bh]X instruction. The first operand is an input +// chain, then an f64 value to store, then an address to store it to, +// followed by a byte-width for the store. def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in +// VSFRC that is sign-extended from ByteWidth to a 64-bit integer. def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; -// Extract FPSCR (not modeled at the DAG level). +// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. def PPCmffs : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, [SDNPHasChain]>; -// Perform FADD in round-to-zero mode. +// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding +// towards zero. Used only as part of the long double-to-int +// conversion sequence.
def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; + +// Constrained floating point add in round-to-zero mode. +let IsStrictFP = true in def PPCstrict_faddrtz: SDNode<"PPCISD::STRICT_FADDRTZ", SDTFPBinOp, [SDNPHasChain]>; @@ -214,72 +263,194 @@ def PPCany_faddrtz: PatFrags<(ops node:$lhs, node:$rhs), [(PPCfaddrtz node:$lhs, node:$rhs), (PPCstrict_faddrtz node:$lhs, node:$rhs)]>; +// FSEL - Traditional three-operand fsel node. def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. def PPCxsmaxc : SDNode<"PPCISD::XSMAXC", SDT_PPCFPMinMax, []>; def PPCxsminc : SDNode<"PPCISD::XSMINC", SDT_PPCFPMinMax, []>; + +// Hi/Lo - These represent the high and low 16-bit parts of a global +// address respectively. These nodes have two operands, the first of +// which must be a TargetGlobalAddress, and the second of which must be a +// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', +// though these are usually folded into other nodes. def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; + +// GPRC = TOC_ENTRY GA, TOC +// Loads the entry for GA from the TOC, where the TOC base is given by +// the last operand. def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad, SDNPMemOperand]>; +// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS +// for non-position independent code on PPC32. def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>; +// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec +// TLS model, produces an ADDIS8 instruction that adds the GOT +// base to sym\@got\@tprel\@ha. def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; + +// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec +// TLS model, produces a LD instruction with base register G8RReg +// and offset sym\@got\@tprel\@l. This completes the addition that +// finds the offset of "sym" relative to the thread pointer. def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, [SDNPMayLoad]>; + +// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec +// and local-exec TLS models, produces an ADD instruction that adds +// the contents of G8RReg to the thread pointer. Symbol contains a +// relocation sym\@tls which is to be replaced by the thread pointer +// and identifies to the linker that the instruction is part of a +// TLS sequence. def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; + +// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsgd\@ha. def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; + +// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. 
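The Hi/Lo split described above follows the usual PowerPC convention that the low 16-bit immediate is sign-extended when consumed, so the high half carries a +0x8000 adjustment in its "@ha" form. A small sketch of that arithmetic (helper names are mine, not part of the patch):

#include <cstdint>
#include <cstdio>

// "@ha": high 16 bits, adjusted so that (ha << 16) + lo reproduces the
// address even though "@l" is sign-extended by addi-style instructions.
uint32_t ha(uint32_t Addr) { return (Addr + 0x8000) >> 16; }
int32_t lo(uint32_t Addr) { return (int16_t)(Addr & 0xffff); }

int main() {
  uint32_t Addr = 0x1234ABCD;
  printf("%#x\n", (ha(Addr) << 16) + lo(Addr)); // 0x1234abcd again
}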
def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; + +// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, +// produces a call to .__tls_get_mod(_$TLSML\@ml). def PPCgetTlsMod : SDNode<"PPCISD::GET_TLS_MOD_AIX", SDTIntUnaryOp>; + +// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on +// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread +// pointer. At the end of the call, the thread pointer is found in R3. def PPCgetTpointer : SDNode<"PPCISD::GET_TPOINTER", SDTIntLeaf, []>; + +// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following +// register assignment. def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// Op that combines two register copies of TOC entries +// (region handle into R3 and variable offset into R4) followed by a +// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. +// This node is used in 64-bit mode as well (in which case the result is +// G8RC and inputs are X3/X4). def PPCTlsgdAIX : SDNode<"PPCISD::TLSGD_AIX", SDTIntBinOp>; + +// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) +// Op that requires a single input of the module handle TOC entry in R3, +// and generates a GET_TLS_MOD_AIX node which will be expanded into a call +// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. +// The only difference is the register class. def PPCTlsldAIX : SDNode<"PPCISD::TLSLD_AIX", SDTIntUnaryOp>; + +// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsld\@ha. def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsld\@l and stores the result in X3. Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; + +// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; + +// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion +// following register assignment. def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds X3 to +// sym\@dtprel\@ha. def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>; + +// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@dtprel\@l. def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; + +// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS +// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. def PPCpaddiDtprel : SDNode<"PPCISD::PADDI_DTPREL", SDTIntBinOp>; +// VPERM - The PPC VPERM Instruction. 
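For reference, the byte-select behavior of VPERM can be modeled in a few lines. This is an emulation sketch, not the backend's lowering, and it uses plain array order rather than the instruction's big-endian element numbering:

#include <cstdint>
#include <cstdio>

// Each mask byte selects one byte out of the 32-byte concatenation of the
// two source vectors; only the low 5 bits of a mask byte are significant.
void vperm(const uint8_t A[16], const uint8_t B[16], const uint8_t Mask[16],
           uint8_t Out[16]) {
  uint8_t Cat[32];
  for (int I = 0; I < 16; ++I) {
    Cat[I] = A[I];
    Cat[16 + I] = B[I];
  }
  for (int I = 0; I < 16; ++I)
    Out[I] = Cat[Mask[I] & 0x1f];
}

int main() {
  uint8_t A[16], B[16], M[16], Out[16];
  for (int I = 0; I < 16; ++I) { A[I] = I; B[I] = 100 + I; M[I] = 31 - I; }
  vperm(A, B, M, Out);
  printf("%d %d\n", Out[0], Out[15]); // 115 100
}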
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +// XXSPLT - The PPC VSX splat instructions. def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; + +// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for +// converting immediate single precision numbers to double precision +// vector or scalar. def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>; + +// VECINSERT - The PPC vector insert instruction. def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; + +// XXPERMDI - The PPC XXPERMDI instruction. def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; + +// VECSHL - The PPC vector shift left instruction. def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; +// The CMPB instruction (takes two operands of i32 or i64). def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift // amounts. These nodes are generated by the multi-precision shift code. +// +// For scalar types, only the last `n + 1` bits of the shift amounts +// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. +// for exact behaviors. +// +// For vector types, only the last n bits are used. See vsld. def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>; def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>; def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>; +// FNMSUB - Negated multiply-subtract instruction. def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>; +// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign +// word and shift left immediate. def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>; -def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ", +// Constrained conversion from floating point to int +let IsStrictFP = true in { + def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ", SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ", + SDTFPUnaryOp, [SDNPHasChain]>; +} def PPCany_fctidz : PatFrags<(ops node:$op), [(PPCstrict_fctidz node:$op), @@ -294,19 +465,24 @@ def PPCany_fctiwuz : PatFrags<(ops node:$op), [(PPCstrict_fctiwuz node:$op), (PPCfctiwuz node:$op)]>; -// Move 2 i64 values into a VSX register +// Direct move of 2 consecutive GPRs to a VSX register. def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128", SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameSizeAs<1,2>, SDTCisSameAs<1,2>]>, []>; +// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and +// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is +// unsupported for this target. +// Merge 2 GPRs to a single SPE register. def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64", SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1,i32>, SDTCisVT<1,i32>]>, []>; +// Extract SPE register component, second argument is high or low.
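The shift-amount note above (a 6-bit amount for 32-bit shifts, zeroing the result for amounts in [32, 63]) matches slw; a quick sketch of the semantics, not of the DAG node itself:

#include <cstdint>
#include <cstdio>

// slw-style shift: only the last n + 1 = 6 bits of the amount are consumed,
// and any amount of 32 or more produces zero rather than wrapping mod 32.
uint32_t slw(uint32_t Value, uint32_t Amount) {
  Amount &= 63;
  return Amount < 32 ? Value << Amount : 0;
}

int main() {
  printf("%u %u\n", slw(1, 31), slw(1, 32)); // 2147483648 0
}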
def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, f64>, @@ -320,6 +496,11 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; + +// CALL - A direct function call. +// CALL_NOP is a call with the special NOP which follows 64-bit +// SVR4 calls and 32-bit/64-bit AIX calls. +// CALL_NOTOC - the caller does not use the TOC. def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -329,17 +510,28 @@ def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall, def PPCcall_notoc : SDNode<"PPCISD::CALL_NOTOC", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a +// MTCTR instruction. def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a +// BCTRL instruction. def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl +// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX +// and 64-bit AIX. def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; -// Call nodes for strictfp calls (that define RM). +// The variants that implicitly define rounding mode for calls with +// strictfp semantics. def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -357,42 +549,81 @@ def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Return with a glue operand, matched by 'blr'. def PPCretglue : SDNode<"PPCISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// TC_RETURN - A tail call return. +// operand #0 chain +// operand #1 callee (register or absolute) +// operand #2 stack adjustment +// operand #3 optional in flag def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// EH_SJLJ_SETJMP - SjLj exception handling setjmp. def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect]>; + +// EH_SJLJ_LONGJMP - SjLj exception handling longjmp. def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; +// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* +// instructions. For lack of better number, we use the opcode number +// encoding for the OPC field to identify the compare. For example, 838 +// is VCMPGTSH. def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; + +// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the +// altivec VCMP*_rec instructions. For lack of better number, we use the +// opcode number encoding for the OPC field to identify the compare. For +// example, 838 is VCMPGTSH. def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; +// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This +// corresponds to the COND_BRANCH pseudo instruction. CRRC is the +// condition register to branch on, OPC is the branch opcode to use (e.g.
+// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is +// an optional input flag argument. def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, [SDNPHasChain, SDNPOptInGlue]>; -// PPC-specific atomic operations. +// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes +// except they ensure that the compare input is zero-extended for +// sub-word versions because the atomic loads zero-extend. def PPCatomicCmpSwap_8 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def PPCatomicCmpSwap_16 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a +// byte-swapping load instruction. It loads "Type" bits, byte-swaps it, +// then puts it in the bottom bits of the GPRC. Type can be either i16 +// or i32. def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a +// byte-swapping store instruction. It byte-swaps the low "Type" bits of +// the GPRC input, then stores it through Ptr. Type can be either i16 or +// i32. def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr +// The store conditional instruction ST[BHWD]ARX that produces a glue +// result to attach it to a conditional branch. def PPCStoreCond : SDNode<"PPCISD::STORE_COND", SDT_StoreCond, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPOutGlue]>; -// Instructions to set/unset CR bit 6 for SVR4 vararg calls +// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, @@ -401,17 +632,44 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; def SDTDynAreaOp : SDTypeProfile<1, 1, []>; + +// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an allocation on the stack. def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; + +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an offset from native SP to the address of the most recent +// dynamic alloca. def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; + +// To avoid stack clash, allocation is performed by block and each block is +// probed. def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>; // PC Relative Specific Nodes + +// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done +// either through an add like PADDI or through a PC Relative load like +// PLD. def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for +// TLS global address when using dynamic access models. This can be done +// through an add like PADDI.
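To make the PROBED_ALLOCA note above concrete, here is a schematic of block-wise probing. Everything in it is an assumption for illustration (the 4 KiB block size, the helper name, and the use of an ordinary buffer instead of the real stack pointer):

#include <cstddef>
#include <cstdint>

constexpr size_t kProbeSize = 4096; // assumed guard-page-sized block

// Carve the allocation out one block at a time, touching each block so the
// stack pointer can never skip past an unmapped guard page (stack clash).
void probedAlloca(volatile uint8_t *SP, size_t Bytes) {
  while (Bytes >= kProbeSize) {
    SP -= kProbeSize;
    *SP = 0; // probe the newly exposed block
    Bytes -= kProbeSize;
  }
  if (Bytes) {
    SP -= Bytes;
    *SP = 0; // probe the residual block
  }
}

int main() {
  static uint8_t Buffer[16384];
  probedAlloca(Buffer + sizeof(Buffer), 10000); // stays inside Buffer
}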
def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address +// when using local exec access models, and when prefixed instructions are +// available. This is used with ADD_TLS to produce an add like PADDI. def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; +// These nodes represent PPC arithmetic operations with carry. def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, [SDNPCommutative]>; def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, @@ -2535,6 +2793,7 @@ defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$RST), (ins f8rc:$RA, f8rc:$RB), // Reciprocal estimates. let mayRaiseFPException = 1 in { +// Reciprocal estimate instructions (unary FP ops). defm FRE : XForm_26r<63, 24, (outs f8rc:$RST), (ins f8rc:$RB), "fre", "$RST, $RB", IIC_FPGeneral, [(set f64:$RST, (PPCfre f64:$RB))]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 2d8c633b9fef6..bd9a999237c09 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -83,15 +83,31 @@ def SDT_PPCsetbc : SDTypeProfile<1, 1, [ // ISA 3.1 specific PPCISD nodes. // +// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>; + +// ACC_BUILD = Build an accumulator register from 4 VSX registers. def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>; + +// PAIR_BUILD = Build a vector pair register from 2 VSX registers. def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>; + +// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of +// an accumulator or pair register. This node is needed because +// EXTRACT_SUBVECTOR expects the input and output vectors to have the same +// element type. def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx, []>; def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx, []>; + +// XXMFACC = This corresponds to the xxmfacc instruction. def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>; + +// SETBC - The ISA 3.1 (P10) SETBC instruction. def PPCsetbc : SDNode<"PPCISD::SETBC", SDT_PPCsetbc, []>; + +// SETBCR - The ISA 3.1 (P10) SETBCR instruction. def PPCsetbcr : SDNode<"PPCISD::SETBCR", SDT_PPCsetbc, []>; //===----------------------------------------------------------------------===// @@ -105,7 +121,10 @@ def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [ SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2> ]>; -// PPC Specific DAG Nodes. +// LXVRZX - Load VSX Vector Rightmost and Zero Extend +// This node represents v1i128 BUILD_VECTOR of a zero-extending load +// instruction from a byte, halfword, word, or doubleword to i128. +// Allows utilization of the Load VSX Vector Rightmost Instructions. def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 885bed670e319..d72201df5b002 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -87,31 +87,91 @@ def SDT_PPCxxperm : SDTypeProfile<1, 3, [ SDTCisVT<0, v2f64>, SDTCisVT<1, v2f64>, SDTCisVT<2, v2f64>, SDTCisVT<3, v4i32>]>; //--------------------------- Custom PPC nodes -------------------------------// + +// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+// Maps directly to an lxvd2x instruction that will be followed by +// an xxswapd. def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to an stxvd2x instruction that will be preceded by +// an xxswapd. def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. +// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on +// the vector type to load vector in big-endian element order. def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on +// the vector type to store vector in big-endian element order. def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little +// endian. Maps to an xxswapd instruction that corrects an lxvd2x +// or stxvd2x instruction. The chain is necessary because the +// sequence replaces a load and needs to provide the same number +// of outputs. def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; + +// Direct move from a VSX register to a GPR. def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (algebraic). def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (zero). def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; + +// Extract a subvector from a signed integer vector and convert to FP. +// It is primarily used to convert a (widened) illegal integer vector +// type to a legal floating point vector type. +// For example v2i32 -> widened to v4i32 -> v2f64 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; + +// Extract a subvector from an unsigned integer vector and convert to FP. +// As with SINT_VEC_TO_FP, used for converting illegal types. def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; + +// An SDNode for swaps that are not associated with any loads/stores +// and thereby have no chain. def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; +// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or +// lower (IDX=1) half of v4f32 to v2f64. def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; + +// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a +// v2f32 value into the lower half of a VSR register. def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory +// instruction such as LXVDSX or LXVWSX. def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// instruction that zero-extends. def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// instruction that sign-extends.
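The XXSWAPD fix-up described above just exchanges the two doublewords of a 16-byte register; a quick emulation (sketch only, not backend code):

#include <cstdint>
#include <cstdio>

// Swap the two 64-bit halves of a vector register image, which is what
// corrects element order after an lxvd2x on a little-endian target.
void xxswapd(uint64_t V[2]) {
  uint64_t Tmp = V[0];
  V[0] = V[1];
  V[1] = Tmp;
}

int main() {
  uint64_t V[2] = {1, 2};
  xxswapd(V);
  printf("%llu %llu\n", (unsigned long long)V[0],
         (unsigned long long)V[1]); // 2 1
}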
def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to +// place the value into the least significant element of the most +// significant doubleword in the vector. This is not element zero for +// anything smaller than a doubleword on either endianness. This node has +// the same semantics as SCALAR_TO_VECTOR except that the value remains in +// the aforementioned location in the vector register. def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", SDTypeProfile<1, 1, []>, []>; diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index 93a4693c50168..80aa1122167df 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -7,20 +7,72 @@ //===----------------------------------------------------------------------===// #include "PPCSelectionDAGInfo.h" -#include "PPCISelLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "PPCGenSDNodeInfo.inc" using namespace llvm; +PPCSelectionDAGInfo::PPCSelectionDAGInfo() + : SelectionDAGGenTargetInfo(PPCGenSDNodeInfo) {} + PPCSelectionDAGInfo::~PPCSelectionDAGInfo() = default; -bool PPCSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_MEMORY_OPCODE && - Opcode <= PPCISD::LAST_MEMORY_OPCODE; +const char *PPCSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { + switch (static_cast<PPCISD::NodeType>(Opcode)) { + case PPCISD::GlobalBaseReg: + return "PPCISD::GlobalBaseReg"; + case PPCISD::SRA_ADDZE: + return "PPCISD::SRA_ADDZE"; + case PPCISD::READ_TIME_BASE: + return "PPCISD::READ_TIME_BASE"; + case PPCISD::MFOCRF: + return "PPCISD::MFOCRF"; + case PPCISD::ANDI_rec_1_EQ_BIT: + return "PPCISD::ANDI_rec_1_EQ_BIT"; + case PPCISD::ANDI_rec_1_GT_BIT: + return "PPCISD::ANDI_rec_1_GT_BIT"; + case PPCISD::BDNZ: + return "PPCISD::BDNZ"; + case PPCISD::BDZ: + return "PPCISD::BDZ"; + case PPCISD::PPC32_PICGOT: + return "PPCISD::PPC32_PICGOT"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; + } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } -bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE && - Opcode <= PPCISD::LAST_STRICTFP_OPCODE; +void PPCSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + default: + break; + case PPCISD::DYNAREAOFFSET: + // invalid number of results; expected 2, got 1 + case PPCISD::TOC_ENTRY: + // invalid number of results; expected 1, got 2 + case PPCISD::STORE_COND: + // invalid number of results; expected 2, got 3 + case PPCISD::LD_SPLAT: + case PPCISD::SEXT_LD_SPLAT: + case PPCISD::ZEXT_LD_SPLAT: + // invalid number of operands; expected 2, got 3 + case PPCISD::ST_VSR_SCAL_INT: + // invalid number of operands; expected 4, got 5 + case PPCISD::XXPERM: + // operand #1 must have type v2f64, but has type v16i8 + case PPCISD::ACC_BUILD: + // operand #3 must have type v4i32, but has type v16i8 + case PPCISD::PAIR_BUILD: + // operand #1 must have type v4i32, but has type v16i8 + return; + } + + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } std::pair<SDValue, SDValue> PPCSelectionDAGInfo::EmitTargetCodeForMemcmp( diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index f962a7a5321aa..ffe8982ce1af4 100644 ---
a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -11,15 +11,66 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "PPCGenSDNodeInfo.inc" + namespace llvm { +namespace PPCISD { + +enum NodeType : unsigned { + /// The result of the mflr at function entry, used for PIC code. + GlobalBaseReg = GENERATED_OPCODE_END, + + /// The combination of sra[wd]i and addze used to implement signed + /// integer division by a power of 2. The first operand is the dividend, + /// and the second is the constant shift amount (representing the + /// divisor). + SRA_ADDZE, + + /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. + /// This copies the bits corresponding to the specified CRREG into the + /// resultant GPR. Bits corresponding to other CR regs are undefined. + MFOCRF, + + // FIXME: Remove these once the ANDI glue bug is fixed: + /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the + /// eq or gt bit of CR0 after executing andi. x, 1. This is used to + /// implement truncation of i32 or i64 to i1. + ANDI_rec_1_EQ_BIT, + ANDI_rec_1_GT_BIT, + + // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit + // target (returns (Lo, Hi)). It takes a chain operand. + READ_TIME_BASE, -class PPCSelectionDAGInfo : public SelectionDAGTargetInfo { + /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based + /// loops. + BDNZ, + BDZ, + + /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and + /// local dynamic TLS and position independent code on PPC32. + PPC32_PICGOT, + + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded + /// during instruction selection to optimize a BUILD_VECTOR into + /// operations on splats. This is necessary to avoid losing these + /// optimizations due to constant folding. + VADD_SPLAT, +}; + +} // namespace PPCISD + +class PPCSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + PPCSelectionDAGInfo(); + ~PPCSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; - bool isTargetStrictFPOpcode(unsigned Opcode) const override; + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f313d3f1347d4..fb298ee35d6c2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16798,9 +16798,7 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, // because X is exact (Y >> M + 2). uint64_t ShAmt = Log2_64(MulAmtLowBit) + 2; using namespace SDPatternMatch; - return sd_match(X, m_AnyOf(m_Sra(m_Value(), m_SpecificInt(ShAmt)), - m_Srl(m_Value(), m_SpecificInt(ShAmt)))) && - X->getFlags().hasExact(); + return sd_match(X, m_ExactSr(m_Value(), m_SpecificInt(ShAmt))); }; if (isPowerOf2_64(MulAmt - MulAmtLowBit) && !(CanSub && PreferSub())) { Op = ISD::ADD; @@ -16825,10 +16823,13 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming a slli.uw. + // Put the shift first if we can fold: + // a. a zext into the shift forming a slli.uw + // b.
an exact shift right forming one shorter shift or no shift at all using namespace SDPatternMatch; if (Shift != 0 && - sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + sd_match(X, m_AnyOf(m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))), + m_ExactSr(m_Value(), m_ConstInt())))) { X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); Shift = 0; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index d17528dd882bf..751ae0fe34d33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -17,7 +17,8 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/IR/Attributes.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Support/Debug.h" #include #define DEBUG_TYPE "spirv-postlegalizer" @@ -43,79 +44,314 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, SPIRVType *KnownResType); } // namespace llvm -static bool mayBeInserted(unsigned Opcode) { - switch (Opcode) { - case TargetOpcode::G_SMAX: - case TargetOpcode::G_UMAX: - case TargetOpcode::G_SMIN: - case TargetOpcode::G_UMIN: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMINIMUM: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMAXIMUM: - return true; +static SPIRVType *deduceIntTypeFromResult(Register ResVReg, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR) { + const LLT &Ty = MIB.getMRI()->getType(ResVReg); + return GR->getOrCreateSPIRVIntegerType(Ty.getScalarSizeInBits(), MIB); +} + +static bool deduceAndAssignTypeForGUnmerge(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register SrcReg = I->getOperand(I->getNumOperands() - 1).getReg(); + SPIRVType *ScalarType = nullptr; + if (SPIRVType *DefType = GR->getSPIRVTypeForVReg(SrcReg)) { + assert(DefType->getOpcode() == SPIRV::OpTypeVector); + ScalarType = GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); + } + + if (!ScalarType) { + // If we could not deduce the type from the source, try to deduce it from + // the uses of the results. + for (unsigned i = 0; i < I->getNumDefs() && !ScalarType; ++i) { + for (const auto &Use : + MRI.use_nodbg_instructions(I->getOperand(i).getReg())) { + assert(Use.getOpcode() == TargetOpcode::G_BUILD_VECTOR && + "Expected use of G_UNMERGE_VALUES to be a G_BUILD_VECTOR"); + if (auto *VecType = + GR->getSPIRVTypeForVReg(Use.getOperand(0).getReg())) { + ScalarType = GR->getScalarOrVectorComponentType(VecType); + break; + } + } + } + } + + if (!ScalarType) + return false; + + for (unsigned i = 0; i < I->getNumDefs(); ++i) { + Register DefReg = I->getOperand(i).getReg(); + if (GR->getSPIRVTypeForVReg(DefReg)) + continue; + + LLT DefLLT = MRI.getType(DefReg); + SPIRVType *ResType = + DefLLT.isVector() + ? 
GR->getOrCreateSPIRVVectorType( + ScalarType, DefLLT.getNumElements(), *I, + *MF.getSubtarget().getInstrInfo()) + : ScalarType; + setRegClassType(DefReg, ResType, GR, &MRI, MF); + } + return true; +} + +static SPIRVType *deduceTypeFromSingleOperand(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned OpIdx) { + Register OpReg = I->getOperand(OpIdx).getReg(); + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(OpReg)) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MIB.getMRI()->getType(ResVReg); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType(CompType, ResLLT.getNumElements(), + MIB, false); + return CompType; + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromOperandRange(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned StartOp, unsigned EndOp) { + SPIRVType *ResType = nullptr; + for (unsigned i = StartOp; i < EndOp; ++i) { + if (SPIRVType *Type = deduceTypeFromSingleOperand(I, MIB, GR, i)) { +#ifdef EXPENSIVE_CHECKS + assert((!ResType || Type == ResType) && "Conflicting type from operands."); + ResType = Type; +#else + return Type; +#endif + } + } + return ResType; +} + +static SPIRVType *deduceTypeForResultRegister(MachineInstr *Use, + Register UseRegister, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + for (const MachineOperand &MO : Use->defs()) { + if (!MO.isReg()) + continue; + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(MO.getReg())) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + const LLT &ResLLT = MIB.getMRI()->getType(UseRegister); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + return CompType; + } + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromUses(Register Reg, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) { + SPIRVType *ResType = nullptr; + switch (Use.getOpcode()) { + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_UNMERGE_VALUES: + LLVM_DEBUG(dbgs() << "Looking at use " << Use << "\n"); + ResType = deduceTypeForResultRegister(&Use, Reg, GR, MIB); + break; + } + if (ResType) + return ResType; + } + return nullptr; +} + +static SPIRVType *deduceResultTypeFromOperands(MachineInstr *I, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + Register ResVReg = I->getOperand(0).getReg(); + switch (I->getOpcode()) { + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_ANYEXT: + return deduceIntTypeFromResult(ResVReg, MIB, GR); + case TargetOpcode::G_BUILD_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, I->getNumOperands()); + case TargetOpcode::G_SHUFFLE_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, 3); default: - return isTypeFoldingSupported(Opcode); + if (I->getNumDefs() == 1 && I->getNumOperands() > 1 && + I->getOperand(1).isReg()) + return deduceTypeFromSingleOperand(I, MIB, GR, 1); + return nullptr; } } -static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +static bool deduceAndAssignSpirvType(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + LLVM_DEBUG(dbgs() << "\nProcessing instruction: " << *I); MachineRegisterInfo &MRI = MF.getRegInfo(); + Register ResVReg = 
I->getOperand(0).getReg(); + + // G_UNMERGE_VALUES is handled separately because it has multiple definitions, + // unlike the other instructions which have a single result register. The main + // deduction logic is designed for the single-definition case. + if (I->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) + return deduceAndAssignTypeForGUnmerge(I, MF, GR); + + LLVM_DEBUG(dbgs() << "Inferring type from operands\n"); + SPIRVType *ResType = deduceResultTypeFromOperands(I, GR, MIB); + if (!ResType) { + LLVM_DEBUG(dbgs() << "Inferring type from uses\n"); + ResType = deduceTypeFromUses(ResVReg, MF, GR, MIB); + } + + if (!ResType) + return false; + + LLVM_DEBUG(dbgs() << "Assigned type to " << *I << ": " << *ResType); + GR->assignSPIRVTypeToVReg(ResType, ResVReg, MF); + if (!MRI.getRegClassOrNull(ResVReg)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); + } + return true; +} + +static bool requiresSpirvType(MachineInstr &I, SPIRVGlobalRegistry *GR, + MachineRegisterInfo &MRI) { + LLVM_DEBUG(dbgs() << "Checking if instruction requires a SPIR-V type: " + << I;); + if (I.getNumDefs() == 0) { + LLVM_DEBUG(dbgs() << "Instruction does not have a definition.\n"); + return false; + } + + if (!I.isPreISelOpcode()) { + LLVM_DEBUG(dbgs() << "Instruction is not a generic instruction.\n"); + return false; + } + + Register ResultRegister = I.defs().begin()->getReg(); + if (GR->getSPIRVTypeForVReg(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Instruction already has a SPIR-V type.\n"); + if (!MRI.getRegClassOrNull(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResultRegister, GR->getSPIRVTypeForVReg(ResultRegister), + GR, &MRI, *GR->CurMF, true); + } + return false; + } + + return true; +} + +static void registerSpirvTypeForNewInstructions(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<MachineInstr *> Worklist; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &I : MBB) { - const unsigned Opcode = I.getOpcode(); - if (Opcode == TargetOpcode::G_UNMERGE_VALUES) { - unsigned ArgI = I.getNumOperands() - 1; - Register SrcReg = I.getOperand(ArgI).isReg() - ? I.getOperand(ArgI).getReg() - : Register(0); - SPIRVType *DefType = - SrcReg.isValid() ? GR->getSPIRVTypeForVReg(SrcReg) : nullptr; - if (!DefType || DefType->getOpcode() != SPIRV::OpTypeVector) - report_fatal_error( - "cannot select G_UNMERGE_VALUES with a non-vector argument"); - SPIRVType *ScalarType = - GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); - for (unsigned i = 0; i < I.getNumDefs(); ++i) { - Register ResVReg = I.getOperand(i).getReg(); - SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResVReg); - if (!ResType) { - // There was no "assign type" actions, let's fix this now - ResType = ScalarType; - setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); - } - } - } else if (mayBeInserted(Opcode) && I.getNumDefs() == 1 && - I.getNumOperands() > 1 && I.getOperand(1).isReg()) { - // Legalizer may have added a new instructions and introduced new - // registers, we must decorate them as if they were introduced in a - // non-automatic way - Register ResVReg = I.getOperand(0).getReg(); - // Check if the register defined by the instruction is newly generated - // or already processed - // Check if we have type defined for operands of the new instruction - bool IsKnownReg = MRI.getRegClassOrNull(ResVReg); - SPIRVType *ResVType = GR->getSPIRVTypeForVReg( - IsKnownReg ? 
ResVReg : I.getOperand(1).getReg()); - if (!ResVType) - continue; - // Set type & class - if (!IsKnownReg) - setRegClassType(ResVReg, ResVType, GR, &MRI, *GR->CurMF, true); - // If this is a simple operation that is to be reduced by TableGen - // definition we must apply some of pre-legalizer rules here - if (isTypeFoldingSupported(Opcode)) { - processInstr(I, MIB, MRI, GR, GR->getSPIRVTypeForVReg(ResVReg)); - if (IsKnownReg && MRI.hasOneUse(ResVReg)) { - MachineInstr &UseMI = *MRI.use_instr_begin(ResVReg); - if (UseMI.getOpcode() == SPIRV::ASSIGN_TYPE) - continue; - } - insertAssignInstr(ResVReg, nullptr, ResVType, GR, MIB, MRI); + if (requiresSpirvType(I, GR, MRI)) { + Worklist.push_back(&I); + } + } + } + + if (Worklist.empty()) { + LLVM_DEBUG(dbgs() << "Initial worklist is empty.\n"); + return; + } + + LLVM_DEBUG(dbgs() << "Initial worklist:\n"; + for (auto *I : Worklist) { I->dump(); }); + + bool Changed; + do { + Changed = false; + SmallVector<MachineInstr *> NextWorklist; + + for (MachineInstr *I : Worklist) { + MachineIRBuilder MIB(*I); + if (deduceAndAssignSpirvType(I, MF, GR, MIB)) { + Changed = true; + } else { + NextWorklist.push_back(I); + } + } + Worklist = std::move(NextWorklist); + LLVM_DEBUG(dbgs() << "Worklist size: " << Worklist.size() << "\n"); + } while (Changed); + + if (Worklist.empty()) + return; + + for (auto *I : Worklist) { + MachineIRBuilder MIB(*I); + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MRI.getType(ResVReg); + SPIRVType *ResType = nullptr; + if (ResLLT.isVector()) { + SPIRVType *CompType = GR->getOrCreateSPIRVIntegerType( + ResLLT.getElementType().getSizeInBits(), MIB); + ResType = GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + } else { + ResType = GR->getOrCreateSPIRVIntegerType(ResLLT.getSizeInBits(), MIB); + } + LLVM_DEBUG(dbgs() << "Could not determine type for " << *I + << ", defaulting to " << *ResType << "\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, MF, true); + } +} + +static void ensureAssignTypeForTypeFolding(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + LLVM_DEBUG(dbgs() << "Entering ensureAssignTypeForTypeFolding for function " + << MF.getName() << "\n"); + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isTypeFoldingSupported(MI.getOpcode())) + continue; + if (MI.getNumOperands() == 1 || !MI.getOperand(1).isReg()) + continue; + + LLVM_DEBUG(dbgs() << "Processing instruction: " << MI); + + // Check uses of MI to see if it already has a use in SPIRV::ASSIGN_TYPE + bool HasAssignType = false; + Register ResultRegister = MI.defs().begin()->getReg(); + // Walk all uses of the result register. + for (MachineInstr &UseInstr : + MRI.use_nodbg_instructions(ResultRegister)) { + if (UseInstr.getOpcode() == SPIRV::ASSIGN_TYPE) { + HasAssignType = true; + LLVM_DEBUG(dbgs() << " Instruction already has an ASSIGN_TYPE use: " + << UseInstr); + break; } } + + if (!HasAssignType) { + SPIRVType *ResultType = GR->getSPIRVTypeForVReg(ResultRegister); + LLVM_DEBUG( + dbgs() << " Adding ASSIGN_TYPE for ResultRegister: " + << printReg(ResultRegister, MRI.getTargetRegisterInfo()) + << " with type: " << *ResultType); + MachineIRBuilder MIB(MI); + insertAssignInstr(ResultRegister, nullptr, ResultType, GR, MIB, MRI); + } } } } @@ -155,10 +391,8 @@ bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { const SPIRVSubtarget &ST = MF.getSubtarget(); SPIRVGlobalRegistry *GR = 
ST.getSPIRVGlobalRegistry(); GR->setCurrentFunc(MF); - MachineIRBuilder MIB(MF); - - processNewInstrs(MF, GR, MIB); - + registerSpirvTypeForNewInstructions(MF, GR); + ensureAssignTypeForTypeFolding(MF, GR); return true; } diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt index 0d8f3eac6ee4f..6d94a755322df 100644 --- a/llvm/lib/Target/SystemZ/CMakeLists.txt +++ b/llvm/lib/Target/SystemZ/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info) tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM SystemZGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(SystemZCommonTableGen) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 58109acc92015..dfd76f9b0427f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7423,153 +7423,6 @@ SystemZTargetLowering::ReplaceNodeResults(SDNode *N, return LowerOperationWrapper(N, Results, DAG); } -const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { -#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME - switch ((SystemZISD::NodeType)Opcode) { - case SystemZISD::FIRST_NUMBER: break; - OPCODE(RET_GLUE); - OPCODE(CALL); - OPCODE(SIBCALL); - OPCODE(TLS_GDCALL); - OPCODE(TLS_LDCALL); - OPCODE(PCREL_WRAPPER); - OPCODE(PCREL_OFFSET); - OPCODE(ICMP); - OPCODE(FCMP); - OPCODE(STRICT_FCMP); - OPCODE(STRICT_FCMPS); - OPCODE(TM); - OPCODE(BR_CCMASK); - OPCODE(SELECT_CCMASK); - OPCODE(ADJDYNALLOC); - OPCODE(PROBED_ALLOCA); - OPCODE(POPCNT); - OPCODE(SMUL_LOHI); - OPCODE(UMUL_LOHI); - OPCODE(SDIVREM); - OPCODE(UDIVREM); - OPCODE(SADDO); - OPCODE(SSUBO); - OPCODE(UADDO); - OPCODE(USUBO); - OPCODE(ADDCARRY); - OPCODE(SUBCARRY); - OPCODE(GET_CCMASK); - OPCODE(MVC); - OPCODE(NC); - OPCODE(OC); - OPCODE(XC); - OPCODE(CLC); - OPCODE(MEMSET_MVC); - OPCODE(STPCPY); - OPCODE(STRCMP); - OPCODE(SEARCH_STRING); - OPCODE(IPM); - OPCODE(TBEGIN); - OPCODE(TBEGIN_NOFLOAT); - OPCODE(TEND); - OPCODE(BYTE_MASK); - OPCODE(ROTATE_MASK); - OPCODE(REPLICATE); - OPCODE(JOIN_DWORDS); - OPCODE(SPLAT); - OPCODE(MERGE_HIGH); - OPCODE(MERGE_LOW); - OPCODE(SHL_DOUBLE); - OPCODE(PERMUTE_DWORDS); - OPCODE(PERMUTE); - OPCODE(PACK); - OPCODE(PACKS_CC); - OPCODE(PACKLS_CC); - OPCODE(UNPACK_HIGH); - OPCODE(UNPACKL_HIGH); - OPCODE(UNPACK_LOW); - OPCODE(UNPACKL_LOW); - OPCODE(VSHL_BY_SCALAR); - OPCODE(VSRL_BY_SCALAR); - OPCODE(VSRA_BY_SCALAR); - OPCODE(VROTL_BY_SCALAR); - OPCODE(SHL_DOUBLE_BIT); - OPCODE(SHR_DOUBLE_BIT); - OPCODE(VSUM); - OPCODE(VACC); - OPCODE(VSCBI); - OPCODE(VAC); - OPCODE(VSBI); - OPCODE(VACCC); - OPCODE(VSBCBI); - OPCODE(VMAH); - OPCODE(VMALH); - OPCODE(VME); - OPCODE(VMLE); - OPCODE(VMO); - OPCODE(VMLO); - OPCODE(VICMPE); - OPCODE(VICMPH); - OPCODE(VICMPHL); - OPCODE(VICMPES); - OPCODE(VICMPHS); - OPCODE(VICMPHLS); - OPCODE(VFCMPE); - OPCODE(STRICT_VFCMPE); - OPCODE(STRICT_VFCMPES); - OPCODE(VFCMPH); - OPCODE(STRICT_VFCMPH); - OPCODE(STRICT_VFCMPHS); - OPCODE(VFCMPHE); - OPCODE(STRICT_VFCMPHE); - OPCODE(STRICT_VFCMPHES); - OPCODE(VFCMPES); - OPCODE(VFCMPHS); - OPCODE(VFCMPHES); - OPCODE(VFTCI); - OPCODE(VEXTEND); - OPCODE(STRICT_VEXTEND); - OPCODE(VROUND); - OPCODE(STRICT_VROUND); - OPCODE(VTM); - OPCODE(SCMP128HI); - 
OPCODE(UCMP128HI); - OPCODE(VFAE_CC); - OPCODE(VFAEZ_CC); - OPCODE(VFEE_CC); - OPCODE(VFEEZ_CC); - OPCODE(VFENE_CC); - OPCODE(VFENEZ_CC); - OPCODE(VISTR_CC); - OPCODE(VSTRC_CC); - OPCODE(VSTRCZ_CC); - OPCODE(VSTRS_CC); - OPCODE(VSTRSZ_CC); - OPCODE(TDC); - OPCODE(ATOMIC_SWAPW); - OPCODE(ATOMIC_LOADW_ADD); - OPCODE(ATOMIC_LOADW_SUB); - OPCODE(ATOMIC_LOADW_AND); - OPCODE(ATOMIC_LOADW_OR); - OPCODE(ATOMIC_LOADW_XOR); - OPCODE(ATOMIC_LOADW_NAND); - OPCODE(ATOMIC_LOADW_MIN); - OPCODE(ATOMIC_LOADW_MAX); - OPCODE(ATOMIC_LOADW_UMIN); - OPCODE(ATOMIC_LOADW_UMAX); - OPCODE(ATOMIC_CMP_SWAPW); - OPCODE(ATOMIC_CMP_SWAP); - OPCODE(ATOMIC_LOAD_128); - OPCODE(ATOMIC_STORE_128); - OPCODE(ATOMIC_CMP_SWAP_128); - OPCODE(LRV); - OPCODE(STRV); - OPCODE(VLER); - OPCODE(VSTER); - OPCODE(STCKF); - OPCODE(PREFETCH); - OPCODE(ADA_ENTRY); - } - return nullptr; -#undef OPCODE -} - // Return true if VT is a vector whose elements are a whole number of bytes // in width. Also check for presence of vector support. bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index d5b76031766dd..13a1cd1614a53 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -22,390 +22,6 @@ #include namespace llvm { -namespace SystemZISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // Return with a glue operand. Operand 0 is the chain operand. - RET_GLUE, - - // Calls a function. Operand 0 is the chain operand and operand 1 - // is the target address. The arguments start at operand 2. - // There is an optional glue operand at the end. - CALL, - SIBCALL, - - // TLS calls. Like regular calls, except operand 1 is the TLS symbol. - // (The call target is implicitly __tls_get_offset.) - TLS_GDCALL, - TLS_LDCALL, - - // Wraps a TargetGlobalAddress that should be loaded using PC-relative - // accesses (LARL). Operand 0 is the address. - PCREL_WRAPPER, - - // Used in cases where an offset is applied to a TargetGlobalAddress. - // Operand 0 is the full TargetGlobalAddress and operand 1 is a - // PCREL_WRAPPER for an anchor point. This is used so that we can - // cheaply refer to either the full address or the anchor point - // as a register base. - PCREL_OFFSET, - - // Integer comparisons. There are three operands: the two values - // to compare, and an integer of type SystemZICMP. - ICMP, - - // Floating-point comparisons. The two operands are the values to compare. - FCMP, - - // Test under mask. The first operand is ANDed with the second operand - // and the condition codes are set on the result. The third operand is - // a boolean that is true if the condition codes need to distinguish - // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the - // register forms do but the memory forms don't). - TM, - - // Branches if a condition is true. Operand 0 is the chain operand; - // operand 1 is the 4-bit condition-code mask, with bit N in - // big-endian order meaning "branch if CC=N"; operand 2 is the - // target block and operand 3 is the flag operand. - BR_CCMASK, - - // Selects between operand 0 and operand 1. Operand 2 is the - // mask of condition-code values for which operand 0 should be - // chosen over operand 1; it has the same form as BR_CCMASK. - // Operand 3 is the flag operand. - SELECT_CCMASK, - - // Evaluates to the gap between the stack pointer and the - // base of the dynamically-allocatable area. 
- ADJDYNALLOC, - - // For allocating stack space when using stack clash protector. - // Allocation is performed by block, and each block is probed. - PROBED_ALLOCA, - - // Count number of bits set in operand 0 per byte. - POPCNT, - - // Wrappers around the ISD opcodes of the same name. The output is GR128. - // Input operands may be GR64 or GR32, depending on the instruction. - SMUL_LOHI, - UMUL_LOHI, - SDIVREM, - UDIVREM, - - // Add/subtract with overflow/carry. These have the same operands as - // the corresponding standard operations, except with the carry flag - // replaced by a condition code value. - SADDO, SSUBO, UADDO, USUBO, ADDCARRY, SUBCARRY, - - // Set the condition code from a boolean value in operand 0. - // Operand 1 is a mask of all condition-code values that may result of this - // operation, operand 2 is a mask of condition-code values that may result - // if the boolean is true. - // Note that this operation is always optimized away, we will never - // generate any code for it. - GET_CCMASK, - - // Use a series of MVCs to copy bytes from one memory location to another. - // The operands are: - // - the target address - // - the source address - // - the constant length - // - // This isn't a memory opcode because we'd need to attach two - // MachineMemOperands rather than one. - MVC, - - // Similar to MVC, but for logic operations (AND, OR, XOR). - NC, - OC, - XC, - - // Use CLC to compare two blocks of memory, with the same comments - // as for MVC. - CLC, - - // Use MVC to set a block of memory after storing the first byte. - MEMSET_MVC, - - // Use an MVST-based sequence to implement stpcpy(). - STPCPY, - - // Use a CLST-based sequence to implement strcmp(). The two input operands - // are the addresses of the strings to compare. - STRCMP, - - // Use an SRST-based sequence to search a block of memory. The first - // operand is the end address, the second is the start, and the third - // is the character to search for. CC is set to 1 on success and 2 - // on failure. - SEARCH_STRING, - - // Store the CC value in bits 29 and 28 of an integer. - IPM, - - // Transaction begin. The first operand is the chain, the second - // the TDB pointer, and the third the immediate control field. - // Returns CC value and chain. - TBEGIN, - TBEGIN_NOFLOAT, - - // Transaction end. Just the chain operand. Returns CC value and chain. - TEND, - - // Create a vector constant by filling byte N of the result with bit - // 15-N of the single operand. - BYTE_MASK, - - // Create a vector constant by replicating an element-sized RISBG-style mask. - // The first operand specifies the starting set bit and the second operand - // specifies the ending set bit. Both operands count from the MSB of the - // element. - ROTATE_MASK, - - // Replicate a GPR scalar value into all elements of a vector. - REPLICATE, - - // Create a vector from two i64 GPRs. - JOIN_DWORDS, - - // Replicate one element of a vector into all elements. The first operand - // is the vector and the second is the index of the element to replicate. - SPLAT, - - // Interleave elements from the high half of operand 0 and the high half - // of operand 1. - MERGE_HIGH, - - // Likewise for the low halves. - MERGE_LOW, - - // Concatenate the vectors in the first two operands, shift them left - // by the third operand, and take the first half of the result. - SHL_DOUBLE, - - // Take one element of the first v2i64 operand and the one element of - // the second v2i64 operand and concatenate them to form a v2i64 result. 
- // The third operand is a 4-bit value of the form 0A0B, where A and B - // are the element selectors for the first operand and second operands - // respectively. - PERMUTE_DWORDS, - - // Perform a general vector permute on vector operands 0 and 1. - // Each byte of operand 2 controls the corresponding byte of the result, - // in the same way as a byte-level VECTOR_SHUFFLE mask. - PERMUTE, - - // Pack vector operands 0 and 1 into a single vector with half-sized elements. - PACK, - - // Likewise, but saturate the result and set CC. PACKS_CC does signed - // saturation and PACKLS_CC does unsigned saturation. - PACKS_CC, - PACKLS_CC, - - // Unpack the first half of vector operand 0 into double-sized elements. - // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. - UNPACK_HIGH, - UNPACKL_HIGH, - - // Likewise for the second half. - UNPACK_LOW, - UNPACKL_LOW, - - // Shift/rotate each element of vector operand 0 by the number of bits - // specified by scalar operand 1. - VSHL_BY_SCALAR, - VSRL_BY_SCALAR, - VSRA_BY_SCALAR, - VROTL_BY_SCALAR, - - // Concatenate the vectors in the first two operands, shift them left/right - // bitwise by the third operand, and take the first/last half of the result. - SHL_DOUBLE_BIT, - SHR_DOUBLE_BIT, - - // For each element of the output type, sum across all sub-elements of - // operand 0 belonging to the corresponding element, and add in the - // rightmost sub-element of the corresponding element of operand 1. - VSUM, - - // Compute carry/borrow indication for add/subtract. - VACC, VSCBI, - // Add/subtract with carry/borrow. - VAC, VSBI, - // Compute carry/borrow indication for add/subtract with carry/borrow. - VACCC, VSBCBI, - - // High-word multiply-and-add. - VMAH, VMALH, - // Widen and multiply even/odd vector elements. - VME, VMLE, VMO, VMLO, - - // Compare integer vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VICMPE is for equality, VICMPH for "signed greater than" - // and VICMPHL for "unsigned greater than". - VICMPE, - VICMPH, - VICMPHL, - - // Likewise, but also set the condition codes on the result. - VICMPES, - VICMPHS, - VICMPHLS, - - // Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and - // greater than" and VFCMPHE for "ordered and greater than or equal to". - VFCMPE, - VFCMPH, - VFCMPHE, - - // Likewise, but also set the condition codes on the result. - VFCMPES, - VFCMPHS, - VFCMPHES, - - // Test floating-point data class for vectors. - VFTCI, - - // Extend the even f32 elements of vector operand 0 to produce a vector - // of f64 elements. - VEXTEND, - - // Round the f64 elements of vector operand 0 to f32s and store them in the - // even elements of the result. - VROUND, - - // AND the two vector operands together and set CC based on the result. - VTM, - - // i128 high integer comparisons. - SCMP128HI, - UCMP128HI, - - // String operations that set CC as a side-effect. - VFAE_CC, - VFAEZ_CC, - VFEE_CC, - VFEEZ_CC, - VFENE_CC, - VFENEZ_CC, - VISTR_CC, - VSTRC_CC, - VSTRCZ_CC, - VSTRS_CC, - VSTRSZ_CC, - - // Test Data Class. - // - // Operand 0: the value to test - // Operand 1: the bit mask - TDC, - - // z/OS XPLINK ADA Entry - // Wraps a TargetGlobalAddress that should be loaded from a function's - // AssociatedData Area (ADA). Tha ADA is passed to the function by the - // caller in the XPLink ABI defined register R5. 
- // Operand 0: the GlobalValue/External Symbol - // Operand 1: the ADA register - // Operand 2: the offset (0 for the first and 8 for the second element in the - // function descriptor) - ADA_ENTRY, - - // Strict variants of scalar floating-point comparisons. - // Quiet and signaling versions. - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPS, - - // Strict variants of vector floating-point comparisons. - // Quiet and signaling versions. - STRICT_VFCMPE, - STRICT_VFCMPH, - STRICT_VFCMPHE, - STRICT_VFCMPES, - STRICT_VFCMPHS, - STRICT_VFCMPHES, - - // Strict variants of VEXTEND and VROUND. - STRICT_VEXTEND, - STRICT_VROUND, - LAST_STRICTFP_OPCODE = STRICT_VROUND, - - // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or - // ATOMIC_LOAD_. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the second operand of , in the high bits of an i32 - // for everything except ATOMIC_SWAPW - // Operand 2: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 3: the negative of operand 2, for rotating the other way - // Operand 4: the width of the field in bits (8 or 16) - FIRST_MEMORY_OPCODE, - ATOMIC_SWAPW = FIRST_MEMORY_OPCODE, - ATOMIC_LOADW_ADD, - ATOMIC_LOADW_SUB, - ATOMIC_LOADW_AND, - ATOMIC_LOADW_OR, - ATOMIC_LOADW_XOR, - ATOMIC_LOADW_NAND, - ATOMIC_LOADW_MIN, - ATOMIC_LOADW_MAX, - ATOMIC_LOADW_UMIN, - ATOMIC_LOADW_UMAX, - - // A wrapper around the inner loop of an ATOMIC_CMP_SWAP. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the compare value, in the low bits of an i32 - // Operand 2: the swap value, in the low bits of an i32 - // Operand 3: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 4: the negative of operand 2, for rotating the other way - // Operand 5: the width of the field in bits (8 or 16) - ATOMIC_CMP_SWAPW, - - // Atomic compare-and-swap returning CC value. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP, - - // 128-bit atomic load. - // Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) - ATOMIC_LOAD_128, - - // 128-bit atomic store. - // OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) - ATOMIC_STORE_128, - - // 128-bit atomic compare-and-swap. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP_128, - - // Byte swapping load/store. Same operands as regular load/store. - LRV, STRV, - - // Element swapping load/store. Same operands as regular load/store. - VLER, VSTER, - - // Use STORE CLOCK FAST to store current TOD clock value. - STCKF, - - // Prefetch from the second operand using the 4-bit control code in - // the first operand. The code is 1 for a load prefetch and 2 for - // a store prefetch. - PREFETCH, - LAST_MEMORY_OPCODE = PREFETCH, -}; - -// Return true if OPCODE is some kind of PC-relative address. -inline bool isPCREL(unsigned Opcode) { - return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; -} -} // end namespace SystemZISD namespace SystemZICMP { // Describes whether an integer comparison needs to be signed or unsigned, @@ -532,8 +148,6 @@ class SystemZTargetLowering : public TargetLowering { return true; } - const char *getTargetNodeName(unsigned Opcode) const override; - // This function currently returns cost for srl/ipm/cc sequence for merging. 
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 547d3dcf92804..a02cafaaafcdf 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -265,74 +265,151 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, SDNPOutGlue]>; def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>; -// Nodes for SystemZISD::*. See SystemZISelLowering.h for more details. +// Return with a glue operand. Operand 0 is the chain operand. def z_retglue : SDNode<"SystemZISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// Calls a function. Operand 0 is the chain operand and operand 1 +// is the target address. The arguments start at operand 2. +// There is an optional glue operand at the end. def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +// TLS calls. Like regular calls, except operand 1 is the TLS symbol. +// (The call target is implicitly __tls_get_offset.) def z_tls_gdcall : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; + +// Wraps a TargetGlobalAddress that should be loaded using PC-relative +// accesses (LARL). Operand 0 is the address. def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>; + +// Used in cases where an offset is applied to a TargetGlobalAddress. +// Operand 0 is the full TargetGlobalAddress and operand 1 is a +// PCREL_WRAPPER for an anchor point. This is used so that we can +// cheaply refer to either the full address or the anchor point +// as a register base. def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET", SDT_ZWrapOffset, []>; + +// Integer comparisons. There are three operands: the two values +// to compare, and an integer of type SystemZICMP. def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>; + +// Floating-point comparisons. The two operands are the values to compare. def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>; -def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, - [SDNPHasChain]>; -def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, - [SDNPHasChain]>; + +let IsStrictFP = true in { + // Strict variants of scalar floating-point comparisons. + // Quiet and signaling versions. + def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, + [SDNPHasChain]>; + def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, + [SDNPHasChain]>; +} + +// Test under mask. The first operand is ANDed with the second operand +// and the condition codes are set on the result. The third operand is +// a boolean that is true if the condition codes need to distinguish +// between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the +// register forms do but the memory forms don't). def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp>; + +// Branches if a condition is true. Operand 0 is the chain operand; +// operand 1 is the 4-bit condition-code mask, with bit N in +// big-endian order meaning "branch if CC=N"; operand 2 is the +// target block and operand 3 is the flag operand. 
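As an aside on the CC-mask convention just described (bit N of the 4-bit mask, counted from the most significant bit, means "branch if CC=N"), the check can be written out in a few lines of C++. This is an illustrative sketch only, not part of the patch, and the helper name is hypothetical:

  #include <cassert>

  // True if the 4-bit CCMask selects condition code CC (0-3).
  // Bit N, counted from the MSB of the mask, corresponds to CC=N.
  bool ccMaskMatches(unsigned CCMask, unsigned CC) {
    assert(CC < 4 && CCMask < 16 && "SystemZ CC is 2 bits, mask is 4 bits");
    return (CCMask >> (3 - CC)) & 1;
  }

For example, a mask of 0b1000 matches only CC=0, while 0b0011 matches CC=2 or CC=3.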
def z_br_ccmask_1 : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask, [SDNPHasChain]>; + +// Selects between operand 0 and operand 1. Operand 2 is the +// mask of condition-code values for which operand 0 should be +// chosen over operand 1; it has the same form as BR_CCMASK. +// Operand 3 is the flag operand. def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask>; + +// Store the CC value in bits 29 and 28 of an integer. def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>; + +// Evaluates to the gap between the stack pointer and the +// base of the dynamically-allocatable area. def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; + +// For allocating stack space when using stack clash protector. +// Allocation is performed by block, and each block is probed. def z_probed_alloca : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca, [SDNPHasChain]>; + +// Count number of bits set in operand 0 per byte. def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; + +// Wrappers around the ISD opcodes of the same name. The output is GR128. +// Input operands may be GR64 or GR32, depending on the instruction. def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; + +// Add/subtract with overflow/carry. These have the same operands as +// the corresponding standard operations, except with the carry flag +// replaced by a condition code value. def z_saddo : SDNode<"SystemZISD::SADDO", SDT_ZBinaryWithFlags>; def z_ssubo : SDNode<"SystemZISD::SSUBO", SDT_ZBinaryWithFlags>; def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>; def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>; def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>; def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>; + +// Compute carry/borrow indication for add/subtract. def z_vacc : SDNode<"SystemZISD::VACC", SDTIntBinOp>; -def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; -def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vscbi : SDNode<"SystemZISD::VSCBI", SDTIntBinOp>; + +// Add/subtract with carry/borrow. +def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; def z_vsbi : SDNode<"SystemZISD::VSBI", SDT_ZTernary>; + +// Compute carry/borrow indication for add/subtract with carry/borrow. +def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vsbcbi : SDNode<"SystemZISD::VSBCBI", SDT_ZTernary>; + +// High-word multiply-and-add. def z_vmah : SDNode<"SystemZISD::VMAH", SDT_ZTernary>; def z_vmalh : SDNode<"SystemZISD::VMALH", SDT_ZTernary>; + +// Widen and multiply even/odd vector elements. def z_vme : SDNode<"SystemZISD::VME", SDT_ZBinaryConv>; def z_vmle : SDNode<"SystemZISD::VMLE", SDT_ZBinaryConv>; def z_vmo : SDNode<"SystemZISD::VMO", SDT_ZBinaryConv>; def z_vmlo : SDNode<"SystemZISD::VMLO", SDT_ZBinaryConv>; +// Byte swapping load/store. Same operands as regular load/store. def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storebswap : SDNode<"SystemZISD::STRV", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Element swapping load/store. Same operands as regular load/store. 
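For intuition about the byte-swapping accesses above (LRV/STRV, and the element-swapping VLER/VSTER defined next), the semantics are simply an ordinary access combined with a byte reversal. A minimal sketch, assuming a GCC/Clang-style builtin; the function name is made up for illustration:

  #include <cstdint>
  #include <cstring>

  // What LRV does in one instruction: a 32-bit load plus a byte swap.
  uint32_t loadByteSwapped32(const void *P) {
    uint32_t V;
    std::memcpy(&V, P, sizeof(V)); // ordinary endian-native load
    return __builtin_bswap32(V);   // reverse the four bytes
  }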
def z_loadeswap : SDNode<"SystemZISD::VLER", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storeeswap : SDNode<"SystemZISD::VSTER", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Use STORE CLOCK FAST to store current TOD clock value. def z_stckf : SDNode<"SystemZISD::STCKF", SDT_ZStoreInherent, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +// Test Data Class. +// +// Operand 0: the value to test +// Operand 1: the bit mask def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>; def z_eh_sjlj_setjmp : SDNode<"ISD::EH_SJLJ_SETJMP", SDT_ZSetJmp, @@ -346,26 +423,75 @@ def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", SDT_ZInsertVectorElt>; def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDT_ZExtractVectorElt>; + +// Create a vector constant by filling byte N of the result with bit +// 15-N of the single operand. def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; + +// Create a vector constant by replicating an element-sized RISBG-style mask. +// The first operand specifies the starting set bit and the second operand +// specifies the ending set bit. Both operands count from the MSB of the +// element. def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>; + +// Replicate a GPR scalar value into all elements of a vector. def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>; + +// Create a vector from two i64 GPRs. def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>; + +// Replicate one element of a vector into all elements. The first operand +// is the vector and the second is the index of the element to replicate. def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>; + +// Interleave elements from the high half of operand 0 and the high half +// of operand 1. def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>; + +// Likewise for the low halves. def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>; + +// Concatenate the vectors in the first two operands, shift them left +// by the third operand, and take the first half of the result. def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>; + +// Concatenate the vectors in the first two operands, shift them left/right +// bitwise by the third operand, and take the first/last half of the result. def z_shl_double_bit : SDNode<"SystemZISD::SHL_DOUBLE_BIT", SDT_ZVecTernaryInt>; def z_shr_double_bit : SDNode<"SystemZISD::SHR_DOUBLE_BIT", SDT_ZVecTernaryInt>; + +// Take one element of the first v2i64 operand and the one element of +// the second v2i64 operand and concatenate them to form a v2i64 result. +// The third operand is a 4-bit value of the form 0A0B, where A and B +// are the element selectors for the first operand and second operands +// respectively. def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", SDT_ZVecTernaryInt>; + +// Perform a general vector permute on vector operands 0 and 1. +// Each byte of operand 2 controls the corresponding byte of the result, +// in the same way as a byte-level VECTOR_SHUFFLE mask. def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; + +// Pack vector operands 0 and 1 into a single vector with half-sized elements. def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; + +// Likewise, but saturate the result and set CC. PACKS_CC does signed +// saturation and PACKLS_CC does unsigned saturation. 
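A scalar model of the saturating packs may help: each wide element is clamped to the narrow type's range before truncation, and CC records whether saturation occurred. Sketch only (i32 to i16 shown; the real nodes operate lane-wise on vectors):

  #include <algorithm>
  #include <cstdint>

  // PACKS_CC-style signed saturation of one element.
  int16_t packSigned(int32_t X) {
    return static_cast<int16_t>(std::clamp<int32_t>(X, INT16_MIN, INT16_MAX));
  }

  // PACKLS_CC-style unsigned saturation of one element.
  uint16_t packUnsigned(uint32_t X) {
    return static_cast<uint16_t>(std::min<uint32_t>(X, UINT16_MAX));
  }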
def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>; def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>; + +// Unpack the first half of vector operand 0 into double-sized elements. +// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnpack>; def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnpack>; + +// Likewise for the second half. def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnpack>; def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnpack>; + +// Shift/rotate each element of vector operand 0 by the number of bits +// specified by scalar operand 1. def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -374,40 +500,75 @@ def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", SDT_ZVecBinaryInt>; + +// For each element of the output type, sum across all sub-elements of +// operand 0 belonging to the corresponding element, and add in the +// rightmost sub-element of the corresponding element of operand 1. def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZBinaryConv>; + +// Compare integer vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VICMPE is for equality, VICMPH for "signed greater than" +// and VICMPHL for "unsigned greater than". def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecCompare>; def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecCompare>; def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecCompare>; + +// Likewise, but also set the condition codes on the result. def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecCompareCC>; def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecCompareCC>; def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecCompareCC>; + +// Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and +// greater than" and VFCMPHE for "ordered and greater than or equal to". def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>; -def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmpes : SDNode<"SystemZISD::STRICT_VFCMPES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>; -def z_strict_vfcmph : SDNode<"SystemZISD::STRICT_VFCMPH", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphs : SDNode<"SystemZISD::STRICT_VFCMPHS", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>; -def z_strict_vfcmphe : SDNode<"SystemZISD::STRICT_VFCMPHE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; + +// Likewise, but also set the condition codes on the result. def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConvCC>; def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConvCC>; def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConvCC>; + +// Extend the even f32 elements of vector operand 0 to produce a vector +// of f64 elements. 
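Concretely, for a v4f32 input the even elements are lanes 0 and 2, so VEXTEND and VROUND pair up as widening/narrowing conversions on those lanes. An illustrative scalar sketch (odd result lanes of the narrowing case are zeroed here only to keep the sketch deterministic):

  // VEXTEND: widen the even f32 lanes of a v4f32 into a v2f64.
  void vextend(const float In[4], double Out[2]) {
    Out[0] = In[0];
    Out[1] = In[2];
  }

  // VROUND: narrow a v2f64 into the even lanes of a v4f32.
  void vround(const double In[2], float Out[4]) {
    Out[0] = static_cast<float>(In[0]);
    Out[2] = static_cast<float>(In[1]);
    Out[1] = Out[3] = 0.0f;
  }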
 def z_vextend            : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>;
-def z_strict_vextend     : SDNode<"SystemZISD::STRICT_VEXTEND",
-                                  SDT_ZVecUnaryConv, [SDNPHasChain]>;
+
+// Round the f64 elements of vector operand 0 to f32s and store them in the
+// even elements of the result.
 def z_vround             : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>;
-def z_strict_vround      : SDNode<"SystemZISD::STRICT_VROUND",
+
+let IsStrictFP = true in {
+  // Strict variants of vector floating-point comparisons.
+  // Quiet and signaling versions.
+  def z_strict_vfcmpe   : SDNode<"SystemZISD::STRICT_VFCMPE",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+  def z_strict_vfcmph   : SDNode<"SystemZISD::STRICT_VFCMPH",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+  def z_strict_vfcmphe  : SDNode<"SystemZISD::STRICT_VFCMPHE",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+  def z_strict_vfcmpes  : SDNode<"SystemZISD::STRICT_VFCMPES",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+  def z_strict_vfcmphs  : SDNode<"SystemZISD::STRICT_VFCMPHS",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+  def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES",
+                                 SDT_ZVecBinaryConv, [SDNPHasChain]>;
+
+  // Strict variants of VEXTEND and VROUND.
+  def z_strict_vextend  : SDNode<"SystemZISD::STRICT_VEXTEND",
+                                 SDT_ZVecUnaryConv, [SDNPHasChain]>;
+  def z_strict_vround   : SDNode<"SystemZISD::STRICT_VROUND",
                                   SDT_ZVecUnaryConv, [SDNPHasChain]>;
+}
+
+// AND the two vector operands together and set CC based on the result.
 def z_vtm                : SDNode<"SystemZISD::VTM", SDT_ZCmp>;
+
+// i128 high integer comparisons.
 def z_scmp128hi          : SDNode<"SystemZISD::SCMP128HI", SDT_ZCmp>;
 def z_ucmp128hi          : SDNode<"SystemZISD::UCMP128HI", SDT_ZCmp>;
+
+// String operations that set CC as a side-effect.
 def z_vfae_cc            : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>;
 def z_vfaez_cc           : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryIntCC>;
 def z_vfee_cc            : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>;
@@ -423,12 +584,24 @@ def z_vstrs_cc           : SDNode<"SystemZISD::VSTRS_CC",
                                   SDT_ZVecTernaryConvCC>;
 def z_vstrsz_cc          : SDNode<"SystemZISD::VSTRSZ_CC",
                                   SDT_ZVecTernaryConvCC>;
+
+// Test floating-point data class for vectors.
 def z_vftci              : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
 
 class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
   : SDNode<"SystemZISD::"#name, profile,
            [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
 
+// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+// ATOMIC_LOAD_<op>.
+//
+// Operand 0: the address of the containing 32-bit-aligned field
+// Operand 1: the second operand of <op>, in the high bits of an i32
+//            for everything except ATOMIC_SWAPW
+// Operand 2: how many bits to rotate the i32 left to bring the first
+//            operand into the high bits
+// Operand 3: the negative of operand 2, for rotating the other way
+// Operand 4: the width of the field in bits (8 or 16)
 def z_atomic_swapw      : AtomicWOp<"ATOMIC_SWAPW">;
 def z_atomic_loadw_add  : AtomicWOp<"ATOMIC_LOADW_ADD">;
 def z_atomic_loadw_sub  : AtomicWOp<"ATOMIC_LOADW_SUB">;
@@ -441,55 +614,117 @@ def z_atomic_loadw_max  : AtomicWOp<"ATOMIC_LOADW_MAX">;
 def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
 def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
 
+// Atomic compare-and-swap returning CC value.
+// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
 def z_atomic_cmp_swap   : SDNode<"SystemZISD::ATOMIC_CMP_SWAP",
                                  SDT_ZAtomicCmpSwap,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                                   SDNPMemOperand]>;
+
+// A wrapper around the inner loop of an ATOMIC_CMP_SWAP.
+//
+// Operand 0: the address of the containing 32-bit-aligned field
+// Operand 1: the compare value, in the low bits of an i32
+// Operand 2: the swap value, in the low bits of an i32
+// Operand 3: how many bits to rotate the i32 left to bring the first
+//            operand into the high bits
+// Operand 4: the negative of operand 3, for rotating the other way
+// Operand 5: the width of the field in bits (8 or 16)
 def z_atomic_cmp_swapw  : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW",
                                  SDT_ZAtomicCmpSwapW,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                                   SDNPMemOperand]>;
 
+// 128-bit atomic load.
+// Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr)
 def z_atomic_load_128   : SDNode<"SystemZISD::ATOMIC_LOAD_128",
                                  SDT_ZAtomicLoad128,
                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+// 128-bit atomic store.
+// OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr)
 def z_atomic_store_128  : SDNode<"SystemZISD::ATOMIC_STORE_128",
                                  SDT_ZAtomicStore128,
                                  [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+// 128-bit atomic compare-and-swap.
+// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
 def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
                                    SDT_ZAtomicCmpSwap128,
                                    [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
                                     SDNPMemOperand]>;
 
+// Use a series of MVCs to copy bytes from one memory location to another.
+// The operands are:
+// - the target address
+// - the source address
+// - the constant length
+//
+// This isn't a memory opcode because we'd need to attach two
+// MachineMemOperands rather than one.
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+// Similar to MVC, but for logic operations (AND, OR, XOR).
 def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+// Use CLC to compare two blocks of memory, with the same comments
+// as for MVC.
 def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
+
+// Use MVC to set a block of memory after storing the first byte.
 def z_memset_mvc        : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+// Use a CLST-based sequence to implement strcmp(). The two input operands
+// are the addresses of the strings to compare.
 def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
+
+// Use an MVST-based sequence to implement stpcpy().
 def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+// Use an SRST-based sequence to search a block of memory. The first
+// operand is the end address, the second is the start, and the third
+// is the character to search for. CC is set to 1 on success and 2
+// on failure.
 def z_search_string     : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZStringCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
+
+// Prefetch from the second operand using the 4-bit control code in
+// the first operand. The code is 1 for a load prefetch and 2 for
+// a store prefetch.
 def z_prefetch          : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
                                  [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
                                   SDNPMemOperand]>;
 
+// Transaction begin. The first operand is the chain, the second
+// the TDB pointer, and the third the immediate control field.
+// Returns CC value and chain.
 def z_tbegin            : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin,
                                  [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
 def z_tbegin_nofloat    : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin,
                                  [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+
+// Transaction end. Just the chain operand. Returns CC value and chain.
 def z_tend              : SDNode<"SystemZISD::TEND", SDT_ZTEnd,
                                  [SDNPHasChain, SDNPSideEffect]>;
 
+// z/OS XPLINK ADA Entry
+// Wraps a TargetGlobalAddress that should be loaded from a function's
+// AssociatedData Area (ADA). The ADA is passed to the function by the
+// caller in the XPLink ABI-defined register R5.
+// Operand 0: the GlobalValue/External Symbol
+// Operand 1: the ADA register
+// Operand 2: the offset (0 for the first and 8 for the second element in the
+//            function descriptor)
 def z_ada_entry         : SDNode<"SystemZISD::ADA_ENTRY", SDT_ZADAENTRY>;
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index eb00d484af693..88feba8adce0e 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -10,21 +10,27 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SystemZSelectionDAGInfo.h"
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 
+#define GET_SDNODE_DESC
+#include "SystemZGenSDNodeInfo.inc"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "systemz-selectiondag-info"
 
-bool SystemZSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
-  return Opcode >= SystemZISD::FIRST_MEMORY_OPCODE &&
-         Opcode <= SystemZISD::LAST_MEMORY_OPCODE;
-}
+SystemZSelectionDAGInfo::SystemZSelectionDAGInfo()
+    : SelectionDAGGenTargetInfo(SystemZGenSDNodeInfo) {}
+
+const char *SystemZSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const {
+  switch (static_cast<SystemZISD::NodeType>(Opcode)) {
+  case SystemZISD::GET_CCMASK:
+    return "SystemZISD::GET_CCMASK";
+  }
 
-bool SystemZSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const {
-  return Opcode >= SystemZISD::FIRST_STRICTFP_OPCODE &&
-         Opcode <= SystemZISD::LAST_STRICTFP_OPCODE;
+  return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode);
 }
 
 static unsigned getMemMemLenAdj(unsigned Op) {
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 200566f9646c1..d25fddab65161 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -15,15 +15,34 @@
 
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 
+#define GET_SDNODE_ENUM
+#include "SystemZGenSDNodeInfo.inc"
+
 namespace llvm {
 
+namespace SystemZISD {
+
+enum NodeType : unsigned {
+  // Set the condition code from a boolean value in operand 0.
+  // Operand 1 is a mask of all condition-code values that may result from
+  // this operation, operand 2 is a mask of condition-code values that may
+  // result if the boolean is true.
+  // Note that this operation is always optimized away; we will never
+  // generate any code for it.
+  GET_CCMASK = GENERATED_OPCODE_END,
+};
 
-class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
-public:
-  explicit SystemZSelectionDAGInfo() = default;
+// Return true if OPCODE is some kind of PC-relative address.
+inline bool isPCREL(unsigned Opcode) { + return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; +} - bool isTargetMemoryOpcode(unsigned Opcode) const override; +} // namespace SystemZISD + +class SystemZSelectionDAGInfo : public SelectionDAGGenTargetInfo { +public: + SystemZSelectionDAGInfo(); - bool isTargetStrictFPOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index f9bd233cf8ecf..434a6d2c3553f 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -31,7 +31,6 @@ set(sources X86CmovConversion.cpp X86CodeGenPassBuilder.cpp X86DomainReassignment.cpp - X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp X86LowerAMXIntrinsics.cpp @@ -57,7 +56,6 @@ set(sources X86IndirectBranchTracking.cpp X86IndirectThunks.cpp X86InterleavedAccess.cpp - X86InsertPrefetch.cpp X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 03706aaaab237..97848bec7127e 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -166,13 +166,6 @@ FunctionPass *createX86IndirectThunksPass(); /// This pass replaces ret instructions with jmp's to __x86_return thunk. FunctionPass *createX86ReturnThunksPass(); -/// This pass ensures instructions featuring a memory operand -/// have distinctive (with respect to each other) -FunctionPass *createX86DiscriminateMemOpsPass(); - -/// This pass applies profiling information to insert cache prefetches. -FunctionPass *createX86InsertPrefetchPass(); - /// This pass insert wait instruction after X87 instructions which could raise /// fp exceptions when strict-fp enabled. FunctionPass *createX86InsertX87waitPass(); diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp deleted file mode 100644 index bd151a450394a..0000000000000 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// This pass aids profile-driven cache prefetch insertion by ensuring all -/// instructions that have a memory operand are distinguishible from each other. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/Debug.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "x86-discriminate-memops" - -static cl::opt EnableDiscriminateMemops( - DEBUG_TYPE, cl::init(false), - cl::desc("Generate unique debug info for each instruction with a memory " - "operand. 
Should be enabled for profile-driven cache prefetching, " - "both in the build of the binary being profiled, as well as in " - "the build of the binary consuming the profile."), - cl::Hidden); - -static cl::opt BypassPrefetchInstructions( - "x86-bypass-prefetch-instructions", cl::init(true), - cl::desc("When discriminating instructions with memory operands, ignore " - "prefetch instructions. This ensures the other memory operand " - "instructions have the same identifiers after inserting " - "prefetches, allowing for successive insertions."), - cl::Hidden); - -namespace { - -using Location = std::pair; - -Location diToLocation(const DILocation *Loc) { - return std::make_pair(Loc->getFilename(), Loc->getLine()); -} - -/// Ensure each instruction having a memory operand has a distinct pair. -void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) { - DebugLoc DL(Loc); - MI->setDebugLoc(DL); -} - -class X86DiscriminateMemOps : public MachineFunctionPass { - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { - return "X86 Discriminate Memory Operands"; - } - -public: - static char ID; - - /// Default construct and initialize the pass. - X86DiscriminateMemOps(); -}; - -bool IsPrefetchOpcode(unsigned Opcode) { - return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || - Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2 || - Opcode == X86::PREFETCHIT0 || Opcode == X86::PREFETCHIT1 || - Opcode == X86::PREFETCHRST2; -} -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86DiscriminateMemOps::ID = 0; - -/// Default construct and initialize the pass. -X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} - -bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { - if (!EnableDiscriminateMemops) - return false; - - DISubprogram *FDI = MF.getFunction().getSubprogram(); - if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) - return false; - - // Have a default DILocation, if we find instructions with memops that don't - // have any debug info. - const DILocation *ReferenceDI = - DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - assert(ReferenceDI && "ReferenceDI should not be nullptr"); - DenseMap MemOpDiscriminators; - MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; - - // Figure out the largest discriminator issued for each Location. When we - // issue new discriminators, we can thus avoid issuing discriminators - // belonging to instructions that don't have memops. This isn't a requirement - // for the goals of this pass, however, it avoids unnecessary ambiguity. - for (auto &MBB : MF) { - for (auto &MI : MBB) { - const auto &DI = MI.getDebugLoc(); - if (!DI) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - Location Loc = diToLocation(DI); - unsigned &Disc = MemOpDiscriminators[Loc]; - Disc = std::max(Disc, DI->getBaseDiscriminator()); - } - } - - // Keep track of the discriminators seen at each Location. If an instruction's - // DebugInfo has a Location and discriminator we've already seen, replace its - // discriminator with a new one, to guarantee uniqueness. 
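The uniquing scheme in the pass being deleted reduces to a per-location set of already-used discriminators, with a bump on collision. A simplified standalone sketch of the idea (types and names simplified for illustration; this is not the pass's actual API):

  #include <map>
  #include <set>
  #include <string>
  #include <utility>

  using Location = std::pair<std::string, unsigned>; // (filename, line)

  // Return a discriminator not yet used at L, and remember it as seen.
  unsigned uniquify(std::map<Location, std::set<unsigned>> &Seen,
                    const Location &L, unsigned Disc) {
    while (!Seen[L].insert(Disc).second)
      ++Disc; // collision at this location; try the next value
    return Disc;
  }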
- DenseMap> Seen; - - bool Changed = false; - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - const DILocation *DI = MI.getDebugLoc(); - bool HasDebug = DI; - if (!HasDebug) { - DI = ReferenceDI; - } - Location L = diToLocation(DI); - DenseSet &Set = Seen[L]; - const std::pair::iterator, bool> TryInsert = - Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second || !HasDebug) { - unsigned BF, DF, CI = 0; - DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); - std::optional EncodedDiscriminator = - DILocation::encodeDiscriminator(MemOpDiscriminators[L] + 1, DF, CI); - - if (!EncodedDiscriminator) { - // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK - // not to support. If evidence points otherwise, we can explore synthesizeing - // unique DIs by adding fake line numbers, or by constructing 64 bit - // discriminators. - LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator " - "for instruction with memory operand in: " - << DI->getFilename() << " Line: " << DI->getLine() - << " Column: " << DI->getColumn() - << ". This is likely due to a large macro expansion. \n"); - continue; - } - // Since we were able to encode, bump the MemOpDiscriminators. - ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); - assert(DI && "DI should not be nullptr"); - updateDebugInfo(&MI, DI); - Changed = true; - std::pair::iterator, bool> MustInsert = - Set.insert(DI->getBaseDiscriminator()); - (void)MustInsert; // Silence warning in release build. - assert(MustInsert.second && "New discriminator shouldn't be present in set"); - } - - // Bump the reference DI to avoid cramming discriminators on line 0. - // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI - // in a block. It's more consistent than just relying on the last memop - // instruction we happened to see. - ReferenceDI = DI; - } - } - return Changed; -} - -FunctionPass *llvm::createX86DiscriminateMemOpsPass() { - return new X86DiscriminateMemOps(); -} diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp deleted file mode 100644 index 953b755a0ca4c..0000000000000 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ /dev/null @@ -1,259 +0,0 @@ -//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass applies cache prefetch instructions based on a profile. The pass -// assumes DiscriminateMemOps ran immediately before, to ensure debug info -// matches the one used at profile generation time. The profile is encoded in -// afdo format (text or binary). It contains prefetch hints recommendations. -// Each recommendation is made in terms of debug info locations, a type (i.e. -// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a -// memory operand (see X86DiscriminateMemOps). The prefetch will be made for -// a location at that memory operand + the delta specified in the -// recommendation. 
-// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Module.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/VirtualFileSystem.h" -#include "llvm/Transforms/IPO/SampleProfile.h" -using namespace llvm; -using namespace sampleprof; - -static cl::opt - PrefetchHintsFile("prefetch-hints-file", - cl::desc("Path to the prefetch hints profile. See also " - "-x86-discriminate-memops"), - cl::Hidden); -namespace { - -class X86InsertPrefetch : public MachineFunctionPass { - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool doInitialization(Module &) override; - - bool runOnMachineFunction(MachineFunction &MF) override; - struct PrefetchInfo { - unsigned InstructionID; - int64_t Delta; - }; - typedef SmallVectorImpl Prefetches; - bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI, - Prefetches &prefetches) const; - -public: - static char ID; - X86InsertPrefetch(const std::string &PrefetchHintsFilename); - StringRef getPassName() const override { - return "X86 Insert Cache Prefetches"; - } - -private: - std::string Filename; - std::unique_ptr Reader; -}; - -using PrefetchHints = SampleRecord::CallTargetMap; - -// Return any prefetching hints for the specified MachineInstruction. The hints -// are returned as pairs (name, delta). -ErrorOr -getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) { - if (const auto &Loc = MI.getDebugLoc()) - if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) - return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), - Loc->getBaseDiscriminator()); - return std::error_code(); -} - -// The prefetch instruction can't take memory operands involving vector -// registers. -bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); - return (BaseReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && - (IndexReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)); -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86InsertPrefetch::ID = 0; - -X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename) - : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {} - -/// Return true if the provided MachineInstruction has cache prefetch hints. In -/// that case, the prefetch hints are stored, in order, in the Prefetches -/// vector. -bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, - const MachineInstr &MI, - Prefetches &Prefetches) const { - assert(Prefetches.empty() && - "Expected caller passed empty PrefetchInfo vector."); - - // There is no point to match prefetch hints if the profile is using MD5. 
- if (FunctionSamples::UseMD5) - return false; - - static constexpr std::pair HintTypes[] = { - {"_nta_", X86::PREFETCHNTA}, - {"_t0_", X86::PREFETCHT0}, - {"_t1_", X86::PREFETCHT1}, - {"_t2_", X86::PREFETCHT2}, - }; - static const char *SerializedPrefetchPrefix = "__prefetch"; - - auto T = getPrefetchHints(TopSamples, MI); - if (!T) - return false; - int16_t max_index = -1; - // Convert serialized prefetch hints into PrefetchInfo objects, and populate - // the Prefetches vector. - for (const auto &S_V : *T) { - StringRef Name = S_V.first.stringRef(); - if (Name.consume_front(SerializedPrefetchPrefix)) { - int64_t D = static_cast(S_V.second); - unsigned IID = 0; - for (const auto &HintType : HintTypes) { - if (Name.consume_front(HintType.first)) { - IID = HintType.second; - break; - } - } - if (IID == 0) - return false; - uint8_t index = 0; - Name.consumeInteger(10, index); - - if (index >= Prefetches.size()) - Prefetches.resize(index + 1); - Prefetches[index] = {IID, D}; - max_index = std::max(max_index, static_cast(index)); - } - } - assert(max_index + 1 >= 0 && - "Possible overflow: max_index + 1 should be positive."); - assert(static_cast(max_index + 1) == Prefetches.size() && - "The number of prefetch hints received should match the number of " - "PrefetchInfo objects returned"); - return !Prefetches.empty(); -} - -bool X86InsertPrefetch::doInitialization(Module &M) { - if (Filename.empty()) - return false; - - LLVMContext &Ctx = M.getContext(); - // TODO: Propagate virtual file system into LLVM targets. - auto FS = vfs::getRealFileSystem(); - ErrorOr> ReaderOrErr = - SampleProfileReader::create(Filename, Ctx, *FS); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg, - DiagnosticSeverity::DS_Warning)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - Reader->read(); - return true; -} - -void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { - if (!Reader) - return false; - const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction()); - if (!Samples) - return false; - - bool Changed = false; - - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - SmallVector Prefetches; - for (auto &MBB : MF) { - for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) { - auto Current = MI; - ++MI; - - int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags); - if (Offset < 0) - continue; - unsigned Bias = X86II::getOperandBias(Current->getDesc()); - int MemOpOffset = Offset + Bias; - // FIXME(mtrofin): ORE message when the recommendation cannot be taken. 
- if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset)) - continue; - Prefetches.clear(); - if (!findPrefetchInfo(Samples, *Current, Prefetches)) - continue; - assert(!Prefetches.empty() && - "The Prefetches vector should contain at least a value if " - "findPrefetchInfo returned true."); - for (auto &PrefInfo : Prefetches) { - unsigned PFetchInstrID = PrefInfo.InstructionID; - int64_t Delta = PrefInfo.Delta; - const MCInstrDesc &Desc = TII->get(PFetchInstrID); - MachineInstr *PFetch = - MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); - MachineInstrBuilder MIB(MF, PFetch); - - static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && - X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && - X86::AddrSegmentReg == 4, - "Unexpected change in X86 operand offset order."); - - // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. - // FIXME(mtrofin): consider adding a: - // MachineInstrBuilder::set(unsigned offset, op). - MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg()) - .addImm( - Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm()) - .addReg( - Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg()) - .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() + - Delta) - .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg) - .getReg()); - - if (!Current->memoperands_empty()) { - MachineMemOperand *CurrentOp = *(Current->memoperands_begin()); - MIB.addMemOperand(MF.getMachineMemOperand( - CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize())); - } - - // Insert before Current. This is because Current may clobber some of - // the registers used to describe the input memory operand. - MBB.insert(Current, PFetch); - Changed = true; - } - } - } - return Changed; -} - -FunctionPass *llvm::createX86InsertPrefetchPass() { - return new X86InsertPrefetch(PrefetchHintsFile); -} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 543220b2fd3b9..713df63479987 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -563,8 +563,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupVectorConstants()); } addPass(createX86CompressEVEXPass()); - addPass(createX86DiscriminateMemOpsPass()); - addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); } diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index d35ae4730a9f3..0f4bc649df720 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments, STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); +STATISTIC(NumImportantContextIds, "Number of important context ids"); +STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted"); +STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added"); +STATISTIC(NumFixedContexts, "Number of contexts with fixed edges"); static cl::opt DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -223,9 +227,18 @@ static cl::opt MemProfRequireDefinitionForPromotion( extern cl::opt MemProfReportHintedSizes; extern cl::opt MinClonedColdBytePercent; +cl::opt MemProfTopNImportant( + "memprof-top-n-important", cl::init(10), 
cl::Hidden,
+    cl::desc("Number of largest cold contexts to consider important"));
+
+cl::opt<bool> MemProfFixupImportant(
+    "memprof-fixup-important", cl::init(true), cl::Hidden,
+    cl::desc("Enables edge fixup for important contexts"));
+
 } // namespace llvm
 
 namespace {
+
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
 /// The graph represents the call contexts in all memprof metadata on allocation
@@ -581,17 +594,26 @@ class CallsiteContextGraph {
 
   /// Adds nodes for the given MIB stack ids.
   template <class NodeT, class IteratorT>
-  void addStackNodesForMIB(ContextNode *AllocNode,
-                           CallStack<NodeT, IteratorT> &StackContext,
-                           CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType,
-                           ArrayRef<ContextTotalSize> ContextSizeInfo);
+  void addStackNodesForMIB(
+      ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
+      CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
+      ArrayRef<ContextTotalSize> ContextSizeInfo,
+      std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold);
 
   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
   /// inlining performed on those callsite instructions.
   void updateStackNodes();
 
+  /// Optionally fix up edges for the N largest cold contexts to better enable
+  /// cloning. This is particularly helpful if the context includes recursion
+  /// as well as inlining, resulting in a single stack node for multiple stack
+  /// ids in the context. With recursion it is difficult to get the edge
+  /// updates correct, as in the general case we have lost the original stack
+  /// id ordering for the context. Do the more expensive fixup only for the
+  /// largest contexts, controlled by MemProfTopNImportant and
+  /// MemProfFixupImportant.
+  void fixupImportantContexts();
+
   /// Update graph to conservatively handle any callsite stack nodes that target
   /// multiple different callee target functions.
   void handleCallsitesWithMultipleTargets();
@@ -658,7 +680,8 @@ class CallsiteContextGraph {
   void assignStackNodesPostOrder(
       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
-      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
+      DenseMap<CallInfo, CallInfo> &CallToMatchingCall,
+      const DenseSet<uint32_t> &ImportantContextIds);
 
   /// Duplicates the given set of context ids, updating the provided
   /// map from each original id with the newly generated context ids,
@@ -859,6 +882,50 @@ class CallsiteContextGraph {
   /// nodes.
   DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
 
+  /// Saves information for the contexts identified as important (the largest
+  /// cold contexts up to MemProfTopNImportant).
+  struct ImportantContextInfo {
+    // The original list of leaf-first stack ids corresponding to this context.
+    std::vector<uint64_t> StackIds;
+    // Max length of stack ids corresponding to a single stack ContextNode for
+    // this context (i.e. the max length of a key in StackIdsToNode below).
+    unsigned MaxLength = 0;
+    // Mapping of slices of the stack ids to the corresponding ContextNode
+    // (there can be multiple stack ids due to inlining). Populated when
+    // updating stack nodes while matching them to the IR or summary.
+    std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode;
+  };
+
+  // Map of important full context ids to information about each.
+  DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo;
+
+  // For each important context id found in Node (if any), records the list of
+  // stack ids that corresponded to the given callsite Node. There can be more
+  // than one in the case of inlining.
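The TotalSizeToContextIdTopNCold map threaded through addStackNodesForMIB above implements a simple top-N selection keyed on size. A self-contained sketch of the pattern, using the same ascending-key std::map trick (the helper name is hypothetical):

  #include <cstdint>
  #include <map>

  // Keep the N largest (Size -> Id) entries. std::map orders keys ascending,
  // so begin() is always the smallest recorded entry, i.e. the eviction
  // candidate. Equal sizes collide and overwrite, as in the patch itself.
  void recordTopN(std::map<uint64_t, uint32_t> &TopN, uint64_t Size,
                  uint32_t Id, unsigned N) {
    if (TopN.size() == N && Size <= TopN.begin()->first)
      return;                   // not larger than the current smallest
    if (TopN.size() == N)
      TopN.erase(TopN.begin()); // evict the smallest of the current top N
    TopN[Size] = Id;
  }

The recordStackNode helper declared next then uses the resulting set of important ids to associate stack-id slices with their callsite nodes.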
+ void recordStackNode(std::vector &StackIds, ContextNode *Node, + // We pass in the Node's context ids to avoid the + // overhead of computing them as the caller already has + // them in some cases. + const DenseSet &NodeContextIds, + const DenseSet &ImportantContextIds) { + if (!MemProfTopNImportant) { + assert(ImportantContextIds.empty()); + return; + } + DenseSet Ids = + set_intersection(NodeContextIds, ImportantContextIds); + if (Ids.empty()) + return; + auto Size = StackIds.size(); + for (auto Id : Ids) { + auto &Entry = ImportantContextIdInfo[Id]; + Entry.StackIdsToNode[StackIds] = Node; + // Keep track of the max to simplify later analysis. + if (Size > Entry.MaxLength) + Entry.MaxLength = Size; + } + } + /// Maps to track the calls to their corresponding nodes in the graph. MapVector AllocationCallToContextNodeMap; MapVector NonAllocationCallToContextNodeMap; @@ -1353,7 +1420,8 @@ template void CallsiteContextGraph::addStackNodesForMIB( ContextNode *AllocNode, CallStack &StackContext, CallStack &CallsiteContext, AllocationType AllocType, - ArrayRef ContextSizeInfo) { + ArrayRef ContextSizeInfo, + std::map &TotalSizeToContextIdTopNCold) { // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1361,8 +1429,33 @@ void CallsiteContextGraph::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + bool IsImportant = false; if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; + // If this is a cold allocation, and we are collecting non-zero largest + // contexts, see if this is a candidate. + if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) { + uint64_t TotalCold = 0; + for (auto &CSI : ContextSizeInfo) + TotalCold += CSI.TotalSize; + // Record this context if either we haven't found the first top-n largest + // yet, or if it is larger than the smallest already recorded. + if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant || + // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly + // sorted in ascending size of its key which is the size. + TotalCold > TotalSizeToContextIdTopNCold.begin()->first) { + if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) { + // Remove old one and its associated entries. 
+ auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second; + TotalSizeToContextIdTopNCold.erase( + TotalSizeToContextIdTopNCold.begin()); + assert(ImportantContextIdInfo.count(IdToRemove)); + ImportantContextIdInfo.erase(IdToRemove); + } + TotalSizeToContextIdTopNCold[TotalCold] = LastContextId; + IsImportant = true; + } + } Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -1381,6 +1474,8 @@ void CallsiteContextGraph::addStackNodesForMIB( for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); ContextIter != StackContext.end(); ++ContextIter) { auto StackId = getStackId(*ContextIter); + if (IsImportant) + ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { StackNode = createNewNode(/*IsAllocation=*/false); @@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode *Node, template void CallsiteContextGraph:: - assignStackNodesPostOrder( - ContextNode *Node, DenseSet &Visited, - DenseMap> - &StackIdToMatchingCalls, - DenseMap &CallToMatchingCall) { + assignStackNodesPostOrder(ContextNode *Node, + DenseSet &Visited, + DenseMap> + &StackIdToMatchingCalls, + DenseMap &CallToMatchingCall, + const DenseSet &ImportantContextIds) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1620,7 +1716,7 @@ void CallsiteContextGraph:: continue; } assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); } // If this node's stack id is in the map, update the graph to contain new @@ -1648,6 +1744,7 @@ void CallsiteContextGraph:: Node->setCall(Call); NonAllocationCallToContextNodeMap[Call] = Node; NodeToCallingFunc[Node] = Func; + recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds); return; } } @@ -1786,6 +1883,9 @@ void CallsiteContextGraph:: : CurNode->computeAllocType(); PrevNode = CurNode; } + + recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds); + if (VerifyNodes) { checkNode(NewNode, /*CheckEdges=*/true); for (auto Id : Ids) { @@ -1798,6 +1898,122 @@ void CallsiteContextGraph:: } } +template +void CallsiteContextGraph::fixupImportantContexts() { + if (ImportantContextIdInfo.empty()) + return; + + // Update statistics as we are done building this map at this point. + NumImportantContextIds = ImportantContextIdInfo.size(); + + if (!MemProfFixupImportant) + return; + + if (ExportToDot) + exportToDot("beforestackfixup"); + + // For each context we identified as important, walk through the saved context + // stack ids in order from leaf upwards, and make sure all edges are correct. + // These can be difficult to get right when updating the graph while mapping + // nodes onto summary or IR, especially when there is recursion. In + // particular, when we have created new nodes to reflect inlining, it is + // sometimes impossible to know exactly how to update the edges in the face of + // recursion, as we have lost the original ordering of the stack ids in the + // contexts. + // TODO: Consider only doing this if we detect the context has recursive + // cycles. + // + // I.e. assume we have a context with stack ids like: {A B A C A D E} + // and let's say A was inlined into B, C, and D. The original graph will have + // multiple recursive cycles through A. When we match the original context + // nodes onto the IR or summary, we will merge {A B} into one context node, + // {A C} onto another, and {A D} onto another. 
Looking at the stack sequence + // above, we should end up with a non-cyclic set of edges like: + // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the + // original ordering, we won't get the edges correct initially (it's + // impossible without the original ordering). Here we do the fixup (add and + // removing edges where necessary) for this context. In the + // ImportantContextInfo struct in this case we should have a MaxLength = 2, + // and map entries for {A B}, {A C}, {A D}, and {E}. + for (auto &[CurContextId, Info] : ImportantContextIdInfo) { + if (Info.StackIdsToNode.empty()) + continue; + bool Changed = false; + ContextNode *PrevNode = nullptr; + ContextNode *CurNode = nullptr; + DenseSet VisitedEdges; + ArrayRef AllStackIds(Info.StackIds); + // Try to identify what callsite ContextNode maps to which slice of the + // context's ordered stack ids. + for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) { + // We will do this greedily, trying up to MaxLength stack ids in a row, to + // see if we recorded a context node for that sequence. + auto Len = Info.MaxLength; + auto LenToEnd = AllStackIds.size() - I; + if (Len > LenToEnd) + Len = LenToEnd; + CurNode = nullptr; + // Try to find a recorded context node starting with the longest length + // recorded, and on down until we check for just a single stack node. + for (; Len > 0; Len--) { + // Get the slice of the original stack id sequence to check. + auto CheckStackIds = AllStackIds.slice(I, Len); + auto EntryIt = Info.StackIdsToNode.find(CheckStackIds); + if (EntryIt == Info.StackIdsToNode.end()) + continue; + CurNode = EntryIt->second; + // Skip forward so we don't try to look for the ones we just matched. + // We increment by Len - 1, because the outer for loop will increment I. + I += Len - 1; + break; + } + // Give up if we couldn't find a node. Since we need to clone from the + // leaf allocation upwards, no sense in doing anymore fixup further up + // the context if we couldn't match part of the original stack context + // onto a callsite node. + if (!CurNode) + break; + // No edges to fix up until we have a pair of nodes that should be + // adjacent in the graph. + if (!PrevNode) + continue; + // See if we already have a call edge from CurNode to PrevNode. + auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode); + if (CurEdge) { + // We already have an edge. Make sure it contains this context id. + if (CurEdge->getContextIds().insert(CurContextId).second) { + NumFixupEdgeIdsInserted++; + Changed = true; + } + } else { + // No edge exists - add one. + NumFixupEdgesAdded++; + DenseSet ContextIds({CurContextId}); + auto AllocType = computeAllocType(ContextIds); + auto NewEdge = std::make_shared( + PrevNode, CurNode, AllocType, std::move(ContextIds)); + PrevNode->CallerEdges.push_back(NewEdge); + CurNode->CalleeEdges.push_back(NewEdge); + // Save the new edge for the below handling. + CurEdge = NewEdge.get(); + Changed = true; + } + VisitedEdges.insert(CurEdge); + // Now remove this context id from any other caller edges calling + // PrevNode. + for (auto &Edge : PrevNode->CallerEdges) { + // Skip the edge updating/created above and edges we have already + // visited (due to recursion). 
+        if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get()))
+          Edge->getContextIds().erase(CurContextId);
+      }
+    }
+    if (Changed)
+      NumFixedContexts++;
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // Map of stack id to all calls with that as the last (outermost caller)
@@ -2043,9 +2259,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
   // nodes representing any inlining at interior callsites. Note we move the
   // associated context ids over to the new nodes.
   DenseSet<const ContextNode *> Visited;
+  DenseSet<uint32_t> ImportantContextIds(llvm::from_range,
+                                         ImportantContextIdInfo.keys());
   for (auto &Entry : AllocationCallToContextNodeMap)
     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
-                              CallToMatchingCall);
+                              CallToMatchingCall, ImportantContextIds);
+
+  fixupImportantContexts();
+
   if (VerifyCCG)
     check();
 }
@@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
     Module &M,
     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
     : Mod(M), OREGetter(OREGetter) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                 AllocNode, StackContext, CallsiteContext,
-                getMIBAllocType(MIBMD), ContextSizeInfo);
+                getMIBAllocType(MIBMD), ContextSizeInfo,
+                TotalSizeToContextIdTopNCold);
           }
           // If exporting the graph to dot and an allocation id of interest was
           // specified, record all the context ids for this allocation node.
@@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
         isPrevailing)
     : Index(Index), isPrevailing(isPrevailing) {
+  // Map for keeping track of the largest cold contexts up to the number given
+  // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys
+  // must be sorted.
+  std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;
   for (auto &I : Index) {
     auto VI = Index.getValueInfo(I);
     for (auto &S : VI.getSummaryList()) {
@@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
         }
         addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
             AllocNode, StackContext, EmptyContext, MIB.AllocType,
-            ContextSizeInfo);
+            ContextSizeInfo, TotalSizeToContextIdTopNCold);
         I++;
       }
       // If exporting the graph to dot and an allocation id of interest was
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 5dc3175382254..f533a47150a7b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -63,9 +63,11 @@ class VPBuilder {
   }

   VPInstruction *createInstruction(unsigned Opcode,
-                                   ArrayRef<VPValue *> Operands, DebugLoc DL,
+                                   ArrayRef<VPValue *> Operands,
+                                   const VPIRMetadata &MD, DebugLoc DL,
                                    const Twine &Name = "") {
-    return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name));
+    return tryInsertInstruction(
+        new VPInstruction(Opcode, Operands, {}, MD, DL, Name));
   }

 public:
@@ -150,17 +152,17 @@ class VPBuilder {
   /// its underlying Instruction.
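+  /// Metadata \p MD and debug location \p DL are taken as given rather than
+  /// being rederived from \p Inst.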
VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, + const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - DebugLoc DL = DebugLoc::getUnknown(); - if (Inst) - DL = Inst->getDebugLoc(); - VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); + VPInstruction *NewVPInst = tryInsertInstruction( + new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, DebugLoc DL, const Twine &Name = "") { - return createInstruction(Opcode, Operands, DL, Name); + return createInstruction(Opcode, Operands, {}, DL, Name); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, @@ -174,8 +176,8 @@ class VPBuilder { Type *ResultTy, const VPIRFlags &Flags = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); + return tryInsertInstruction(new VPInstructionWithType( + Opcode, Operands, ResultTy, Flags, {}, DL, Name)); } VPInstruction *createOverflowingOp( @@ -189,13 +191,14 @@ class VPBuilder { VPInstruction *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(VPInstruction::Not, {Operand}, DL, Name); + return createInstruction(VPInstruction::Not, {Operand}, {}, DL, Name); } VPInstruction *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, {}, DL, + Name); } VPInstruction *createOr(VPValue *LHS, VPValue *RHS, @@ -210,20 +213,18 @@ class VPBuilder { VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); + return createNaryOp(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name); } VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "", std::optional FMFs = std::nullopt) { - auto *Select = - FMFs ? 
new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, {}, DL, Name) - : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - DL, Name); - return tryInsertInstruction(Select); + if (!FMFs) + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, + Name); + return tryInsertInstruction(new VPInstruction( + Instruction::Select, {Cond, TrueVal, FalseVal}, *FMFs, {}, DL, Name)); } /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A @@ -306,7 +307,7 @@ class VPBuilder { const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); + new VPInstructionWithType(Opcode, Op, ResultTy, Flags, Metadata, DL)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 10bd6cd471152..356d759b94799 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7616,14 +7616,13 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, } if (VPI->getOpcode() == Instruction::Load) { auto *Load = cast(I); - return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - VPIRMetadata(*Load, LVer), I->getDebugLoc()); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI, + VPI->getDebugLoc()); } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, - Consecutive, Reverse, - VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); + Consecutive, Reverse, *VPI, VPI->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will @@ -7751,7 +7750,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), + return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, VPI->getDebugLoc()); Function *Variant = nullptr; @@ -7843,7 +7842,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; - return new VPWidenRecipe(*I, Ops); + return new VPWidenRecipe(*I, Ops, *VPI, VPI->getDebugLoc()); } [[fallthrough]]; } @@ -7889,7 +7888,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { // For other binops, the legacy cost model only checks the second operand. 
NewOps[1] = GetConstantViaSCEV(NewOps[1]); } - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } case Instruction::ExtractValue: { SmallVector NewOps(VPI->operands()); @@ -7897,7 +7896,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; NewOps.push_back(Plan.getConstantInt(32, Idx)); - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } }; } @@ -7981,8 +7980,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, - BlockInMask, VPIRMetadata(*I, LVer)); + auto *Recipe = + new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI); return Recipe; } @@ -8235,13 +8234,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return new VPWidenGEPRecipe(cast(Instr), R->operands()); if (VPI->getOpcode() == Instruction::Select) - return new VPWidenSelectRecipe(*cast(Instr), R->operands()); + return new VPWidenSelectRecipe(*cast(Instr), R->operands(), + *VPI); if (Instruction::isCast(VPI->getOpcode())) { auto *CastR = cast(R); auto *CI = cast(Instr); return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), - CastR->getResultType(), *CI); + CastR->getResultType(), *CI, *VPI); } return tryToWiden(VPI); @@ -8269,7 +8269,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, SmallVector Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*ReductionI, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRMetadata(), + ReductionI->getDebugLoc()); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } @@ -8302,7 +8303,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8408,7 +8409,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // VPInstructions in the loop. // --------------------------------------------------------------------------- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, LVer); + Builder, BlockMaskCache); // TODO: Handle partial reductions with EVL tail folding. if (!CM.foldTailWithEVL()) RecipeBuilder.collectScaledReductions(Range); @@ -8453,9 +8454,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. 
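           // (Other stores to the invariant address are redundant in the
           // vector loop and are simply erased below.)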
if (Legal->isInvariantStoreOfReduction(SI)) { - auto *Recipe = - new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */, - nullptr /*Mask*/, VPIRMetadata(*SI, LVer)); + auto *Recipe = new VPReplicateRecipe( + SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, + *cast(SingleDef)); Recipe->insertBefore(*MiddleVPBB, MBIP); } R.eraseFromParent(); @@ -8606,7 +8607,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // addScalarResumePhis. DenseMap BlockMaskCache; VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, nullptr /*LVer*/); + Builder, BlockMaskCache); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index a7000aff06379..87280b83fc0e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -84,10 +84,6 @@ class VPRecipeBuilder { /// A mapping of partial reduction exit instructions to their scaling factor. DenseMap ScaledReductionMap; - /// Loop versioning instance for getting noalias metadata guaranteed by - /// runtime checks. - LoopVersioning *LVer; - /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -144,11 +140,9 @@ class VPRecipeBuilder { LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder, - DenseMap &BlockMaskCache, - LoopVersioning *LVer) + DenseMap &BlockMaskCache) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache), - LVer(LVer) {} + CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0932922c07126..c81834e401726 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -65,7 +65,6 @@ class VPReplicateRecipe; class VPlanSlp; class Value; class LoopVectorizationCostModel; -class LoopVersioning; struct VPCostContext; @@ -958,10 +957,6 @@ class VPIRMetadata { /// \p I. VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } - /// Adds metatadata that can be preserved from the original instruction - /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. - VPIRMetadata(Instruction &I, LoopVersioning *LVer); - /// Copy constructor for cloning. VPIRMetadata(const VPIRMetadata &Other) = default; @@ -970,14 +965,17 @@ class VPIRMetadata { /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; - /// Add metadata with kind \p Kind and \p Node. - void addMetadata(unsigned Kind, MDNode *Node) { - assert(none_of(Metadata, - [Kind](const std::pair &P) { - return P.first == Kind; - }) && - "Kind must appear at most once in Metadata"); - Metadata.emplace_back(Kind, Node); + /// Set metadata with kind \p Kind to \p Node. If metadata with \p Kind + /// already exists, it will be replaced. Otherwise, it will be added. 
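+  /// For illustration, a hypothetical caller (the names here are examples,
+  /// not part of this interface) observes the add-or-replace behavior:
+  /// \code
+  ///   VPIRMetadata MD(*Inst);
+  ///   MD.setMetadata(LLVMContext::MD_noalias, N1); // no entry yet: added
+  ///   MD.setMetadata(LLVMContext::MD_noalias, N2); // entry exists: replaced
+  /// \endcode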
+ void setMetadata(unsigned Kind, MDNode *Node) { + auto It = + llvm::find_if(Metadata, [Kind](const std::pair &P) { + return P.first == Kind; + }); + if (It != Metadata.end()) + It->second = Node; + else + Metadata.emplace_back(Kind, Node); } /// Intersect this VPIRMetada object with \p MD, keeping only metadata @@ -1117,11 +1115,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, public: VPInstruction(unsigned Opcode, ArrayRef Operands, - DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, {}, {}, DL, Name) {} - - VPInstruction(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1211,14 +1205,10 @@ class VPInstructionWithType : public VPInstruction { public: VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, + Type *ResultTy, const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), - ResultTy(ResultTy) {} - - VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, - const VPIRMetadata &Metadata, const Twine &Name = "") : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), ResultTy(ResultTy) {} @@ -1247,7 +1237,7 @@ class VPInstructionWithType : public VPInstruction { VPInstruction *clone() override { auto *New = new VPInstructionWithType(getOpcode(), operands(), getResultType(), - *this, getDebugLoc(), getName()); + *this, *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1331,7 +1321,7 @@ class VPPhiAccessors { struct LLVM_ABI_FOR_TEST VPPhi : public VPInstruction, public VPPhiAccessors { VPPhi(ArrayRef Operands, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Instruction::PHI, Operands, DL, Name) {} + : VPInstruction(Instruction::PHI, Operands, {}, {}, DL, Name) {} static inline bool classof(const VPUser *U) { auto *VPI = dyn_cast(U); @@ -1475,9 +1465,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), VPIRMetadata(Metadata), Opcode(Opcode) {} - VPWidenRecipe(Instruction &I, ArrayRef Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), - Opcode(I.getOpcode()) {} + VPWidenRecipe(Instruction &I, ArrayRef Operands, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), + VPIRMetadata(Metadata), Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1518,13 +1509,12 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst &UI) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPIRMetadata(UI), - Opcode(Opcode), ResultTy(ResultTy) { + CastInst &UI, const VPIRMetadata &Metadata) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), + VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}, @@ -1587,18 
+1577,23 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), - VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), + VPIRMetadata(MD), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), MayWriteToMemory(CI.mayWriteToMemory()), MayHaveSideEffects(CI.mayHaveSideEffects()) {} VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), - VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags, + DL), + VPIRMetadata(Metadata), VectorIntrinsicID(VectorIntrinsicID), + ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); AttributeSet Attrs = Intrinsic::getFnAttributes(Ctx, VectorIntrinsicID); MemoryEffects ME = Attrs.getMemoryEffects(); @@ -1614,9 +1609,10 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenIntrinsicRecipe *clone() override { if (Value *CI = getUnderlyingValue()) return new VPWidenIntrinsicRecipe(*cast(CI), VectorIntrinsicID, - operands(), ResultTy, getDebugLoc()); + operands(), ResultTy, *this, + getDebugLoc()); return new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(), ResultTy, - getDebugLoc()); + *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC) @@ -1757,15 +1753,16 @@ class VPHistogramRecipe : public VPRecipeBase { /// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { - VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands) + VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands, + const VPIRMetadata &MD = {}) : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I), - VPIRMetadata(I) {} + VPIRMetadata(MD) {} ~VPWidenSelectRecipe() override = default; VPWidenSelectRecipe *clone() override { return new VPWidenSelectRecipe(*cast(getUnderlyingInstr()), - operands()); + operands(), *this); } VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 4ffd5577d31a4..612202d049774 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #define DEBUG_TYPE "vplan" @@ -37,6 +38,9 @@ class PlainCFGBuilder { // Loop Info analysis. LoopInfo *LI; + // Loop versioning for alias metadata. + LoopVersioning *LVer; + // Vectorization plan that we are working on. std::unique_ptr Plan; @@ -65,8 +69,8 @@ class PlainCFGBuilder { void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); public: - PlainCFGBuilder(Loop *Lp, LoopInfo *LI) - : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} + PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer) + : TheLoop(Lp), LI(LI), LVer(LVer), Plan(std::make_unique(Lp)) {} /// Build plain CFG for TheLoop and connect it to Plan's entry. 
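+  /// Loads and stores are tagged with the alias-scope/noalias metadata
+  /// provided by LVer, when available.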
std::unique_ptr buildPlainCFG(); @@ -186,7 +190,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // recipes. if (Br->isConditional()) { VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); - VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst); + VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); } // Skip the rest of the Instruction processing for Branch instructions. @@ -200,7 +205,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, SmallVector Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); - VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst); + VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); continue; } @@ -228,6 +234,18 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } else { + // Build VPIRMetadata from the instruction and add loop versioning + // metadata for loads and stores. + VPIRMetadata MD(*Inst); + if (isa(Inst) && LVer) { + const auto &[AliasScopeMD, NoAliasMD] = + LVer->getNoAliasMetadataFor(Inst); + if (AliasScopeMD) + MD.setMetadata(LLVMContext::MD_alias_scope, AliasScopeMD); + if (NoAliasMD) + MD.setMetadata(LLVMContext::MD_noalias, NoAliasMD); + } + // Translate LLVM-IR operands into VPValue operands and set them in the // new VPInstruction. SmallVector VPOperands; @@ -236,12 +254,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, if (auto *CI = dyn_cast(Inst)) { NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], - CI->getType(), CI->getDebugLoc()); + CI->getType(), CI->getDebugLoc(), + {}, MD); NewR->setUnderlyingValue(CI); } else { // Build VPInstruction for any arbitrary Instruction without specific // representation in VPlan. 
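+        // Thread through the metadata collected above (including any noalias
+        // scopes from LVer) along with the instruction's debug location.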
- NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst); + NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, MD, + Inst->getDebugLoc()); } } @@ -537,8 +557,9 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, std::unique_ptr VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, - DebugLoc IVDL, PredicatedScalarEvolution &PSE) { - PlainCFGBuilder Builder(TheLoop, &LI); + DebugLoc IVDL, PredicatedScalarEvolution &PSE, + LoopVersioning *LVer) { + PlainCFGBuilder Builder(TheLoop, &LI, LVer); std::unique_ptr VPlan0 = Builder.buildPlainCFG(); addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); return VPlan0; @@ -672,7 +693,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -756,7 +777,7 @@ void VPlanTransforms::addMinimumIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights( ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -793,7 +814,7 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(Weights, /*IsExpected=*/false); - Branch->addMetadata(LLVMContext::MD_prof, BranchWeights); + Branch->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// If \p RedPhiR is used by a ComputeReductionResult recipe, return it. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e2a8e495d5ed5..fca6554ad77c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -36,7 +36,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" #include using namespace llvm; @@ -1674,17 +1673,6 @@ void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, } #endif -VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) - : VPIRMetadata(I) { - if (!LVer || !isa(&I)) - return; - const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); - if (AliasScopeMD) - Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); - if (NoAliasMD) - Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); -} - void VPIRMetadata::applyMetadata(Instruction &I) const { for (const auto &[Kind, Node] : Metadata) I.setMetadata(Kind, Node); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 1453c6623625b..3b5cc9fcb9820 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -517,7 +517,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { assert(CombinedOperands.size() > 0 && "Need more some operands"); auto *Inst = cast(Values[0])->getUnderlyingInstr(); - auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + auto *VPI = + new VPInstruction(Opcode, CombinedOperands, {}, {}, Inst->getDebugLoc()); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0] << "\n"); diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bbeb447de45cb..89118b49bed44 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -85,20 +85,19 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( Ingredient.getDebugLoc()); } } else { - assert(isa(&Ingredient) && - "only VPInstructions expected here"); + auto *VPI = cast(&Ingredient); assert(!isa(Inst) && "phis should be handled above"); // Create VPWidenMemoryRecipe for loads and stores. if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + false /*Consecutive*/, false /*Reverse*/, *VPI, Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - VPIRMetadata(*Store), Ingredient.getDebugLoc()); + nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI, + Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -107,15 +106,17 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return false; NewRecipe = new VPWidenIntrinsicRecipe( *CI, getVectorIntrinsicIDForCall(CI, &TLI), - drop_end(Ingredient.operands()), CI->getType(), + drop_end(Ingredient.operands()), CI->getType(), *VPI, CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast(Inst)) { - NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); + NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands(), *VPI); } else if (auto *CI = dyn_cast(Inst)) { - NewRecipe = new VPWidenCastRecipe( - CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); + NewRecipe = + new VPWidenCastRecipe(CI->getOpcode(), Ingredient.getOperand(0), + CI->getType(), *CI, *VPI); } else { - NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); + NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI, + Ingredient.getDebugLoc()); } } @@ -1705,8 +1706,9 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, Ops.append({ALM, Plan.getOrAddLiveIn( ConstantInt::get(IntegerType::getInt64Ty(Ctx), VF.getKnownMinValue() * Part))}); - auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, - IntegerType::getInt1Ty(Ctx), DL); + auto *Ext = + new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), {}, {}, DL); Extracts[Part] = Ext; Ext->insertAfter(ALM); } @@ -1845,7 +1847,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // The vector region contains header phis for which we cannot remove the // loop region yet. 
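     // Instead, branch on a constant true condition so that the latch
     // unconditionally exits the region.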
auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()}, - Term->getDebugLoc()); + {}, {}, Term->getDebugLoc()); ExitingVPBB->appendRecipe(BOC); } @@ -2679,13 +2681,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) return new VPWidenIntrinsicRecipe( Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc()); if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS), m_VPValue(RHS)))) return new VPWidenIntrinsicRecipe( Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc()); return nullptr; } @@ -2753,7 +2755,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, - TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); + TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {}, + R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); ToErase.push_back(&R); @@ -4458,7 +4461,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); - MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); + MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e3bde8a47dcbc..a44a4f69c917b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ namespace llvm { class InductionDescriptor; class Instruction; +class LoopVersioning; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -99,7 +100,7 @@ struct VPlanTransforms { /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, - PredicatedScalarEvolution &PSE); + PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr); /// Update \p Plan to account for all early exits. 
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll index 317feb5ad9ad0..0ef2b31d00daa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) { define <2 x i16> @test1(ptr %v2i16_ptr) { ; CHECK-LE-LABEL: test1: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #2 -; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test1: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr 
h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,213 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to 
<2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @zext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: zext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = zext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @sext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: sext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = sext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index e85e808921c87..a302ddf483caa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -219,21 +219,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0, #2] -; CHECK-NEON-NEXT: ldr h0, [x0] +; 
CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: mov v0.d[1], x8 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0, #2] -; CHECK-SVE-NEXT: ldr h0, [x0] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: mov v0.d[1], x8 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 96168cb80196f..7502db4c5aa93 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index a7875dbebd0e6..d8d003c85eed6 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -176,12 +176,12 @@ entry: define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -212,12 +212,12 @@ entry: define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i8: ; 
CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -248,12 +248,12 @@ entry: define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -698,12 +695,10 @@ entry: define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -734,12 +729,10 @@ entry: define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; 
CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -770,12 +763,10 @@ entry: define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 20f19fddf790a..002e6cd509bec 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: add x8, sp, #12 ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 04124609eec74..b1b869ec9e1ff 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -6,11 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] +; CHECK-SD-NEXT: ldr h1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #24 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] @@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -145,11 +143,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] +; CHECK-SD-NEXT: ldr s1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #16 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index d547b6bec5b83..9c59f1b233b5d 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -6,10 +6,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; 
CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h @@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -143,10 +141,8 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index fc9bf2c0aca65..c9181b4c312d1 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -6,10 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ldr h0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b @@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -219,10 +218,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e40..d18cff51c6101 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; 
CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x0, #4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir new file mode 100644 index 0000000000000..6a10df68ddc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s + +# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR) +# are hoisted out of loops by MachineLICM, as they are expensive on AArch64. + +--- | + declare void @use_float(float) + declare void @use_int(i32) + + define void @gpr_to_fpr_virtual_copy_hoisted() { + ret void + } + + define void @gpr_to_fpr_physical_copy_hoisted() { + ret void + } + + define void @fpr_to_gpr_virtual_copy_hoisted() { + ret void + } +... 
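+# The GPR-to-FPR copy of the loop-invariant virtual register %0 is the only cross-bank instruction in the loop body; the CHECK lines expect it to be hoisted into the preheader bb.0.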
+--- +name: gpr_to_fpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $w1 + %1:gpr32 = COPY $w0 + %0:gpr32 = COPY $w1 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY %0:gpr32 + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
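+# Same loop structure, but here the hoisted cross-bank copy reads the physical register $wzr instead of a virtual register.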
+--- +name: gpr_to_fpr_physical_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY3]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0 + %1:gpr32 = COPY $w0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY $wzr + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
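+# The opposite direction: an FPR-to-GPR copy of the loop-invariant argument %0 (taken from $s0) should likewise be hoisted out of the loop.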
+--- +name: fpr_to_gpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $w0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $s0 + %1:gpr32 = COPY $w0 + %0:fpr32 = COPY $s0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:gpr32 = COPY %0:fpr32 + $w0 = COPY %7:gpr32 + BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll index 4bf65e7d6fd08..cb042757a4a42 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.llround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.llround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/llround-conv.ll b/llvm/test/CodeGen/AArch64/llround-conv.ll index 797136037f0e9..4cc089804ce97 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv.ll @@ -1,60 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl llroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: 
testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b llroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b llroundl entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone -declare i64 @llvm.llround.f128(fp128) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c4bb6e37d6eaf..b138fa4085427 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll index bf78fd456eac0..a29dea0eb9f9f 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.lround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/lround-conv.ll b/llvm/test/CodeGen/AArch64/lround-conv.ll index 678d3149f20cc..0bf82b538e70c 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv.ll @@ -1,60 +1,75 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl lroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b lroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b lroundl entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f32(float) nounwind readnone -declare i64 @llvm.lround.i64.f64(double) nounwind readnone -declare i64 @llvm.lround.i64.f128(fp128) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9c69a6f03b858..475bd22c6ebcb 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -68,13 +68,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -240,13 +237,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 1c4a504d0ab70..b31a5ea0b5d79 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; 
CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll index 3e708b0678fbc..297b25ed075e4 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB3_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #1 -; CHECK-NEXT: ldrsb w10, [x9] -; CHECK-NEXT: ldrsb w9, [x9, #1] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, x8, lsl #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: str q0, [x1, x8, lsl #4] diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 3af858713525b..02eb40b412efd 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 3a9f12b838702..1dc55fccc3dac 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ ; 
CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: strb w2, [x3, #2] ; CHECK-SD-NEXT: mov v0.h[1], w1 ; CHECK-SD-NEXT: mov v0.h[2], w2 ; CHECK-SD-NEXT: xtn v0.8b, v0.8h -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] -; CHECK-SD-NEXT: strb w2, [x3, #2] -; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: str h0, [x3] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 5e278d59b6591..dd920b98e18eb 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index ba7bee9a94bac..a77c74ab67b80 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -7,8 +7,10 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_zext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -97,8 +99,10 @@ define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 { define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_sext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x 
i16>, ptr %ap %val = sext <4 x i16> %a to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 6fd5b820a2242..b457e0307fbe1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 @@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..4fb3bf7392d4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 @@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 3cfb24aaccb11..cd02d18e61643 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; 
CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a71cf95a728db..ef70137e6deee 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll index f6553b6acec9d..6d4061fb02cff 100644 --- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll +++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s -; CHECK-LABEL: convert_v3f32 -; CHECK: strb -; CHECK: strh define void @convert_v3f32() { +; CHECK-LABEL: convert_v3f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: strb wzr, 
[x8] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: br label %bb diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 80029fb717575..ee74984125f77 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-SD-NEXT: shl.16b v0, v0, #7 ; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE ; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; CHECK-SD-NEXT: add x8, sp, #14 ; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 ; CHECK-SD-NEXT: and.16b v0, v0, v1 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 ; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: addv.8h h0, v0 -; CHECK-SD-NEXT: str h0, [sp, #14] -; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x1 -; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ushll.8h v0, v0, #0 +; CHECK-SD-NEXT: ushll.4s v0, v0, #0 ; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 7d3f5bc270d6b..60414adba75fc 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldr s0, [x0] ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: strb w8, [x1, #2] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: str s0, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: str h0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; 
BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #3] -; BE-NEXT: sturh w8, [x1, #1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #5] -; BE-NEXT: sturh w8, [x1, #3] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -764,10 +764,9 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: mov h0, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: ushll.4s v1, v1, #0 ; CHECK-NEXT: stur b0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] +; CHECK-NEXT: str h1, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll new file mode 100644 index 0000000000000..84ac58f899717 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 
%s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fmul_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fmul_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps half @fmul_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fmul_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fmul_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fmul_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps float @fmul_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fmul_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps float @fmul_s32_div(float %a, float %b) { +; GCN-LABEL: fmul_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps void @fmul_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fmul_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fmul_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: 
fmul_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fmul_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fmul_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_mul_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_mul_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x half> @fmul_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fmul_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fmul_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_mul_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s2 +; GFX12-NEXT: s_mul_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fmul_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index e03aa18d3147f..1220c0e3b1ead 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -4,6 +4,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s +; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support. 
+ define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir index 5766c05426b2d..f289566a27c12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- name: fmul_ss @@ -17,6 +17,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FMUL]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_FMUL %0, %1 diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll new file mode 100644 index 0000000000000..479de53dd90f2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; NOTE: This sample test demonstrates the pretty print feature for NVPTX intrinsics +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_fp16_cta1( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 
3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_f8f6f4_cta2( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + + ret void +} + +; This test verifies that printImmArg is safe to call on all constant arguments, but only prints comments for arguments that have pretty printing configured. 
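+; The leading i64 42, i32 100, and i1 true operands are also constants, but have no pretty printing configured, so no comment is expected in front of them.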
+define void @test_mixed_constants_edge_case(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor) { + ; CHECK-LABEL: define void @test_mixed_constants_edge_case( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, i32 3, i32 1, i32 0) + + ret void +} + +declare void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6), ptr addrspace(6), i64, i32, i1, i32, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 4ab4ff84dac57..fb26b8b16a290 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -5016,3 +5016,74 @@ define ptr @shl_add_knownbits(ptr %p, i64 %i) { %r = getelementptr i8, ptr %p, i64 %shr ret ptr %r } + +define i64 @exactashr1mul6(i64 %a) { +; RV64I-LABEL: exactashr1mul6: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul6: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul6: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 6 + ret i64 %d +} + +define i64 @exactlshr3mul22(i64 %a) { +; RV64I-LABEL: exactlshr3mul22: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 3 +; RV64I-NEXT: li a1, 22 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactlshr3mul22: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srli a0, a0, 2 +; RV64ZBA-NEXT: sh2add a1, a0, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactlshr3mul22: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: srli a0, a0, 2 +; RV64XANDESPERF-NEXT: nds.lea.w a1, a0, a0 +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1 +; RV64XANDESPERF-NEXT: ret + %c = lshr exact i64 %a, 3 + %d = mul i64 %c, 22 + ret i64 %d +} + +define i64 @exactashr1mul36(i64 %a) { +; RV64I-LABEL: exactashr1mul36: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul36: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a0, a0, 1 +; RV64ZBA-NEXT: sh3add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul36: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: slli a0, a0, 1 +; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 36 + ret i64 %d +} diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 0fbfb42d2a4dd..78a02b11b17bb 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -68,8 +68,6 @@ ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll deleted file mode 100644 index 6bbf3eb307da3..0000000000000 --- 
a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %pos2 to i64 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1 - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 2, column: 20, scope: !7) - - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: .loc 1 1 0 {{.*}} discriminator 2 -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll deleted file mode 100644 index ca412c590b2e3..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; RUN: llc -x86-discriminate-memops -x86-bypass-prefetch-instructions=0 < %s | FileCheck %s -check-prefix=NOBYPASS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - call void @llvm.prefetch(ptr %arrayidx2, i32 0, i32 3, i32 1) - %1 = load i32, ptr %arrayidx2, align 4, 
!dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetcht0 (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 - -;NOBYPASS-LABEL: sum: -;NOBYPASS: # %bb.0: -;NOBYPASS: prefetcht0 (%rdi,%rax,4) -;NOBYPASS-NEXT: .loc 1 2 22 -;NOBYPASS-NEXT: movl (%rdi,%rax,4), %eax -;NOBYPASS-NEXT: .loc 1 2 20 {{.*}} discriminator 2 # test.cc:2:20 -;NOBYPASS-NEXT: addl (%rdi,%rcx,4), %eax -;NOBYPASS-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops.ll deleted file mode 100644 index a8421d9506a87..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", 
linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo deleted file mode 100644 index 935b707ff1072..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ /dev/null @@ -1,4 +0,0 @@ -caller:0:0 - 2: sum:0 - 3: 0 __prefetch_nta_0:23456 - 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll b/llvm/test/CodeGen/X86/insert-prefetch-inline.ll deleted file mode 100644 index 05f542799c08b..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s -; -; Verify we can insert prefetch instructions in code belonging to inlined -; functions. -; -; ModuleID = 'test.cc' - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @sum(ptr nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !10 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !10 - %0 = load i32, ptr %arrayidx, align 4, !dbg !10, !tbaa !11 - %idxprom1 = sext i32 %pos2 to i64, !dbg !15 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !15 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !15, !tbaa !11 - %add = add nsw i32 %1, %0, !dbg !16 - ret i32 %add, !dbg !17 -} - -; "caller" inlines "sum". The associated .afdo file references instructions -; in "caller" that came from "sum"'s inlining. 
-; -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @caller(ptr nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 { -entry: - %0 = load i32, ptr %arr, align 4, !dbg !19, !tbaa !11 - %arrayidx2.i = getelementptr inbounds i32, ptr %arr, i64 2, !dbg !21 - %1 = load i32, ptr %arrayidx2.i, align 4, !dbg !21, !tbaa !11 - %add.i = add nsw i32 %1, %0, !dbg !22 - ret i32 %add.i, !dbg !23 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DIFile(filename: "./test.h", directory: "/tmp") -!9 = !DISubroutineType(types: !2) -!10 = !DILocation(line: 6, column: 10, scope: !7) -!11 = !{!12, !12, i64 0} -!12 = !{!"int", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 6, column: 22, scope: !7) -!16 = !DILocation(line: 6, column: 20, scope: !7) -!17 = !DILocation(line: 6, column: 3, scope: !7) -!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20) -!20 = distinct !DILocation(line: 6, column: 10, scope: !18) -!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20) -!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20) -!23 = !DILocation(line: 6, column: 3, scope: !18) - -; CHECK-LABEL: caller: -; CHECK-LABEL: # %bb.0: -; CHECK-NEXT: .loc 1 6 22 prologue_end -; CHECK-NEXT: prefetchnta 23464(%rdi) -; CHECK-NEXT: movl 8(%rdi), %eax -; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2 -; CHECK-NEXT: prefetchnta 8764(%rdi) -; CHECK-NEXT: prefetchnta 64(%rdi) -; CHECK-NEXT: addl (%rdi), %eax diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo deleted file mode 100644 index 6385a498b8f92..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo +++ /dev/null @@ -1,2 +0,0 @@ -main:0:0 - 6: 0 __prefetch_nta_0:42 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll deleted file mode 100644 index f8e25028cfdee..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s -; ModuleID = 'prefetch.cc' -source_filename = "prefetch.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - tail 
call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9 - ret i32 291, !dbg !11 -} - -; Function Attrs: inaccessiblemem_or_argmemonly nounwind -declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1 - -attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"} -attributes #1 = { inaccessiblemem_or_argmemonly nounwind } -attributes #2 = { argmemonly nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 12, column: 3, scope: !7) -!10 = !DILocation(line: 14, column: 3, scope: !7) -!11 = !DILocation(line: 15, column: 3, scope: !7) - -;CHECK-LABEL: main: -;CHECK: # %bb.0: -;CHECK: prefetchnta 291 -;CHECK-NOT: prefetchnta 42(%rax,%ymm0) diff --git a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo b/llvm/test/CodeGen/X86/insert-prefetch-other.afdo deleted file mode 100644 index 783da34f7f84c..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42 - 1.1: 0 __prefetch_t1_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.afdo b/llvm/test/CodeGen/X86/insert-prefetch.afdo deleted file mode 100644 index 96487e85eaaf2..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42 - 1.1: 0 __prefetch_nta_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.ll b/llvm/test/CodeGen/X86/insert-prefetch.ll deleted file mode 100644 index 971a6193862d0..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; NOTE: debug line numbers were adjusted such that the function would start -; at line 15 (an arbitrary number). The sample profile file format uses -; offsets from the start of the symbol instead of file-relative line numbers. -; The .afdo file reflects that - the instructions are offset '1'. 
-; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !38 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !38 - %0 = load i32, ptr %arrayidx, align 4, !dbg !38, !tbaa !39 - %idxprom1 = sext i32 %pos2 to i64, !dbg !43 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !43 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !43, !tbaa !39 - %add = add nsw i32 %1, %0, !dbg !44 - ret i32 %add, !dbg !45 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5, !6} -!llvm.ident = !{!33} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{i32 1, !"ProfileSummary", !7} -!7 = !{!8, !9, !10, !11, !12, !13, !14, !15} -!8 = !{!"ProfileFormat", !"SampleProfile"} -!9 = !{!"TotalCount", i64 0} -!10 = !{!"MaxCount", i64 0} -!11 = !{!"MaxInternalCount", i64 0} -!12 = !{!"MaxFunctionCount", i64 0} -!13 = !{!"NumCounts", i64 2} -!14 = !{!"NumFunctions", i64 1} -!15 = !{!"DetailedSummary", !16} -!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32} -!17 = !{i32 10000, i64 0, i32 0} -!18 = !{i32 100000, i64 0, i32 0} -!19 = !{i32 200000, i64 0, i32 0} -!20 = !{i32 300000, i64 0, i32 0} -!21 = !{i32 400000, i64 0, i32 0} -!22 = !{i32 500000, i64 0, i32 0} -!23 = !{i32 600000, i64 0, i32 0} -!24 = !{i32 700000, i64 0, i32 0} -!25 = !{i32 800000, i64 0, i32 0} -!26 = !{i32 900000, i64 0, i32 0} -!27 = !{i32 950000, i64 0, i32 0} -!28 = !{i32 990000, i64 0, i32 0} -!29 = !{i32 999000, i64 0, i32 0} -!30 = !{i32 999900, i64 0, i32 0} -!31 = !{i32 999990, i64 0, i32 0} -!32 = !{i32 999999, i64 0, i32 0} -!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"} -!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!36 = !DISubroutineType(types: !2) -!37 = !{!"function_entry_count", i64 -1} -!38 = !DILocation(line: 16, column: 10, scope: !35) -!39 = !{!40, !40, i64 0} -!40 = !{!"int", !41, i64 0} -!41 = !{!"omnipotent char", !42, i64 0} -!42 = !{!"Simple C++ TBAA"} -!43 = !DILocation(line: 16, column: 22, scope: !35) -!44 = !DILocation(line: 16, column: 20, scope: !35) -!45 = !DILocation(line: 16, column: 3, scope: !35) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetchnta 42(%rdi,%rax,4) -;CHECK-NEXT: prefetchnta (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;CHECK-NEXT: prefetchnta -1(%rdi,%rcx,4) -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 16 3 # test.cc:16:3 - -;OTHERS-LABEL: sum: -;OTHERS: # %bb.0: -;OTHERS: prefetcht2 42(%rdi,%rax,4) -;OTHERS-NEXT: prefetcht0 (%rdi,%rax,4) -;OTHERS-NEXT: movl (%rdi,%rax,4), %eax -;OTHERS-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;OTHERS-NEXT: prefetcht1 -1(%rdi,%rcx,4) -;OTHERS-NEXT: addl (%rdi,%rcx,4), %eax -;OTHERS-NEXT: .loc 1 16 3 # 
test.cc:16:3
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e59d0d0a..276232e27c000 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -208,8 +208,6 @@
 ; CHECK-NEXT: X86 Fixup Inst Tuning
 ; CHECK-NEXT: X86 Fixup Vector Constants
 ; CHECK-NEXT: Compressing EVEX instrs when possible
-; CHECK-NEXT: X86 Discriminate Memory Operands
-; CHECK-NEXT: X86 Insert Cache Prefetches
 ; CHECK-NEXT: X86 insert wait instruction
 ; CHECK-NEXT: Contiguously Lay Out Funclets
 ; CHECK-NEXT: Remove Loads Into Fake Uses
diff --git a/llvm/test/MC/AsmParser/directive_base64.s b/llvm/test/MC/AsmParser/directive_base64.s
new file mode 100644
index 0000000000000..46a477eef51dc
--- /dev/null
+++ b/llvm/test/MC/AsmParser/directive_base64.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-unknown-unknown -defsym=ERR=1 -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+ .data
+# CHECK-LABEL: TEST0:
+# CHECK-NEXT: .byte 0
+TEST0:
+ .base64 "AA=="
+
+# CHECK-LABEL: TEST1:
+# CHECK-NEXT: .ascii "abcxyz"
+TEST1:
+ .base64 "YWJjeHl6"
+
+# CHECK-LABEL: TEST2:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST2:
+ .base64 "AQ=="
+ .base64 "Ag=="
+
+# CHECK-LABEL: TEST3:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST3:
+ .base64 "AQ==", "Ag=="
+
+.ifdef ERR
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected string
+ .base64 not-a-string
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: failed to base64 decode string data
+ .base64 "AA"
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected nonempty string
+ .base64 ""
+.endif
diff --git a/llvm/test/TableGen/intrinsic-arginfo.td b/llvm/test/TableGen/intrinsic-arginfo.td
new file mode 100644
index 0000000000000..eab1f5e032bc3
--- /dev/null
+++ b/llvm/test/TableGen/intrinsic-arginfo.td
@@ -0,0 +1,71 @@
+// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s | FileCheck %s
+
+// Test ArgInfo property for pretty-printing intrinsic arguments.
+// This test verifies that TableGen generates the correct pretty-printing code
+// for intrinsics that use the ArgInfo property.
+
+include "llvm/IR/Intrinsics.td"
+
+// Simple intrinsic with two arguments that have ArgInfo.
+def int_dummy_foo_bar : DefaultAttrsIntrinsic<
+    [llvm_i32_ty],
+    [llvm_i32_ty,   // data
+     llvm_i32_ty,   // mode
+     llvm_i32_ty],  // stride
+    [IntrNoMem,
+     ImmArg<ArgIndex<1>>,
+     ArgInfo<ArgIndex<1>, [ArgName<"mode">, ImmArgPrinter<"printDummyMode">]>,
+     ArgInfo<ArgIndex<2>, [ArgName<"stride">]>]>;
+
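+// With the ArgInfo annotations above, the generated Intrinsic::printImmArg
+// is expected to behave roughly as follows (a sketch; ModeVal and StrideVal
+// stand for the constant immediate operands of a call to this intrinsic):
+//
+//   printImmArg(Intrinsic::dummy_foo_bar, /*ArgIdx=*/1, OS, ModeVal);
+//   // prints "mode=" and then whatever printDummyMode emits
+//   printImmArg(Intrinsic::dummy_foo_bar, /*ArgIdx=*/2, OS, StrideVal);
+//   // prints "stride=" only, since no ImmArgPrinter is attached
+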
+// A custom floating point add with rounding and sat mode.
+def int_my_fadd_f32 : DefaultAttrsIntrinsic<
+    [llvm_float_ty],
+    [llvm_float_ty,  // a
+     llvm_float_ty,  // b
+     llvm_i32_ty,    // rounding_mode
+     llvm_i1_ty],    // saturation_mode
+    [IntrNoMem,
+     ImmArg<ArgIndex<2>>,
+     ImmArg<ArgIndex<3>>,
+     ArgInfo<ArgIndex<2>, [ArgName<"rounding_mode">, ImmArgPrinter<"printRoundingMode">]>,
+     ArgInfo<ArgIndex<3>, [ArgName<"saturation_mode">]>]>;
+
+// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_TABLE
+// CHECK-NEXT: static constexpr uint8_t PPTable[] = {
+
+// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_TABLE
+
+// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS
+// CHECK: void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) {
+
+// CHECK: case dummy_foo_bar:
+// CHECK-NEXT: switch (ArgIdx) {
+
+// CHECK-NEXT: case 1:
+// CHECK-NEXT: OS << "mode=";
+// CHECK-NEXT: printDummyMode(OS, ImmArgVal);
+// CHECK-NEXT: return;
+
+// CHECK-NEXT: case 2:
+// CHECK-NEXT: OS << "stride=";
+// CHECK-NEXT: return;
+
+// CHECK-NEXT: }
+// CHECK-NEXT: break;
+
+// CHECK: case my_fadd_f32:
+// CHECK-NEXT: switch (ArgIdx) {
+
+// CHECK-NEXT: case 2:
+// CHECK-NEXT: OS << "rounding_mode=";
+// CHECK-NEXT: printRoundingMode(OS, ImmArgVal);
+// CHECK-NEXT: return;
+
+// CHECK-NEXT: case 3:
+// CHECK-NEXT: OS << "saturation_mode=";
+// CHECK-NEXT: return;
+
+// CHECK-NEXT: }
+// CHECK-NEXT: break;
+
+// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS
diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll
new file mode 100644
index 0000000000000..afed80fc562c1
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll
@@ -0,0 +1,129 @@
+;; Test fixup of largest cold contexts.
+
+;; This case has multiple recursive cycles in the cold context, which can be
+;; made non-recursive with the inlining in the code.
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+;; Need context sizes in summary, so enable reporting.
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+
+;; First try disabling detection of the largest cold contexts.
+;; We will not get any cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:   -supports-hot-cold-new \
+; RUN:   -memprof-top-n-important=0 \
+; RUN:   -r=%t.o,E,plx \
+; RUN:   -r=%t.o,DB,plx \
+; RUN:   -r=%t.o,CB,plx \
+; RUN:   -r=%t.o,A,plx \
+; RUN:   -r=%t.o,main,plx \
+; RUN:   -r=%t.o,_Znam, \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:   -pass-remarks=memprof-context-disambiguation \
+; RUN:   -o %t.out 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:   --implicit-check-not="Number of cold static allocations" \
+; RUN:   --implicit-check-not="Number of function clones" \
+; RUN:   --implicit-check-not="Number of important context ids" \
+; RUN:   --implicit-check-not="Number of fixup"
+
+;; Allow default detection of the largest cold contexts, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -memprof-fixup-important=false \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \ +; RUN: --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of fixup" + +; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids + +;; Allow default detection of largest cold contexts, fixup is enabled by default. +;; This case should get fixup and cloning. +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1 + +; TOPN1: created clone E.memprof.1 +; TOPN1: call in clone E marked with memprof allocation attribute notcold +; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold +; TOPN1: created clone DB.memprof.1 +; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1 +; TOPN1: created clone CB.memprof.1 +; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1 +; TOPN1: created clone A.memprof.1 +; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1 +; TOPN1: call in clone main assigned to call function clone A.memprof.1 + +; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges +; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added +; TOPN1: 1 memprof-context-disambiguation - Number of important context ids + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @E() { +entry: + %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14 + ret void +} + +define void @DB() { +entry: + tail call void @E(), !callsite !17 + ret void +} + +define void @CB() { +entry: + tail call void @DB(), !callsite !22 + ret void +} + +define void @A() { +entry: + tail call void @CB(), !callsite !20 + ret void +} + +define i32 @main() { +entry: + tail call void @A(), !callsite !25 + tail call void @A(), !callsite !27 + ret i32 0 +} + +declare ptr @_Znam(i64) + +!7 = !{!8, !10} +!8 = !{!9, !"cold", !2} +!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678} +!2 = !{i64 12345, i64 200} +!10 = !{!11, !"notcold", !3} +!3 = !{i64 23456, i64 200} +!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789} +!14 = !{i64 123} +!17 = !{i64 234, i64 345} +!22 = !{i64 234, i64 456} +!20 = !{i64 234, i64 567} +!25 = !{i64 678} +!27 = !{i64 789} diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index fe25d1b231efc..ed027e8b9a895 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -497,6 +497,129 @@ 
exit: ret void } +define void @noalias_metadata(ptr align 8 %dst, ptr align 8 %src) { +; CHECK-LABEL: define void @noalias_metadata( +; CHECK-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP23]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[NEXT_GEP]], align 8, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: store ptr [[TMP7]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; CHECK-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; INTERLEAVE-LABEL: define void @noalias_metadata( +; INTERLEAVE-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; INTERLEAVE-NEXT: 
[[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; INTERLEAVE: [[VECTOR_MEMCHECK]]: +; INTERLEAVE-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; INTERLEAVE-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; INTERLEAVE-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP23]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 2 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8, !alias.scope [[META14:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; INTERLEAVE-NEXT: store ptr [[TMP8]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; INTERLEAVE-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; INTERLEAVE-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: ret void +; +entry: + br label %loop + +loop: + %ptr = phi ptr 
[ %ptr.next, %loop ], [ %src, %entry ] + %val = load ptr, ptr %ptr, align 8 + store ptr %val, ptr %dst, align 8, !noalias !4 + %ptr.next = getelementptr inbounds i8, ptr %ptr, i64 8 + %cmp = icmp eq ptr %ptr, %dst + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + declare i64 @foo(i64) declare double @bar(double) @@ -510,6 +633,9 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ !1 = !{ i64 0, i64 2 } !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"omnipotent char", !2, i64 0} +!4 = !{!5} +!5 = distinct !{!5, !6, !"g1"} +!6 = distinct !{!6, !"t2"} ;. ; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} @@ -526,6 +652,17 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; CHECK: [[META14]] = !{[[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} +; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; CHECK: [[META21]] = distinct !{[[META21]], !"t2"} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; CHECK: [[META23]] = !{[[META20]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. ; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; INTERLEAVE: [[META1]] = !{!"omnipotent char", [[META2]]} @@ -541,4 +678,15 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; INTERLEAVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; INTERLEAVE: [[META14]] = !{[[META15:![0-9]+]]} +; INTERLEAVE: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; INTERLEAVE: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; INTERLEAVE: [[META17]] = !{[[META18:![0-9]+]]} +; INTERLEAVE: [[META18]] = distinct !{[[META18]], [[META16]]} +; INTERLEAVE: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; INTERLEAVE: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; INTERLEAVE: [[META21]] = distinct !{[[META21]], !"t2"} +; INTERLEAVE: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; INTERLEAVE: [[META23]] = !{[[META20]]} +; INTERLEAVE: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll new file mode 100644 index 0000000000000..a08f89b5bbe97 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll @@ -0,0 +1,105 @@ +;; Test fixup of largest cold contexts. + +;; This case has multiple recursive cycles in the cold context, which can be +;; made non-recursive with the inlining in the code. + +;; -stats requires asserts +; REQUIRES: asserts + +;; First try disabling detection of the largest cold contexts. +;; We will not get any cloning. 
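+;; (The call chain below is main -> A -> CB -> DB -> E, with the allocation
+;; in E. Stack id 234 repeats across the callsite metadata, which is what
+;; makes the cold context appear recursive.)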
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:   -memprof-top-n-important=0 \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:   -pass-remarks=memprof-context-disambiguation \
+; RUN:   %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \
+; RUN:   --implicit-check-not="Number of cold static allocations" \
+; RUN:   --implicit-check-not="Number of function clones" \
+; RUN:   --implicit-check-not="Number of important context ids" \
+; RUN:   --implicit-check-not="Number of fixup"
+
+;; Allow default detection of the largest cold contexts, but disable fixup.
+;; We should find 1 important context, but still not get cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:   -memprof-fixup-important=false \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:   -pass-remarks=memprof-context-disambiguation \
+; RUN:   %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \
+; RUN:   --implicit-check-not="created clone" \
+; RUN:   --implicit-check-not="Number of cold static allocations" \
+; RUN:   --implicit-check-not="Number of function clones" \
+; RUN:   --implicit-check-not="Number of fixup"
+
+; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids
+
+;; Allow default detection of largest cold contexts, fixup is enabled by default.
+;; This case should get fixup and cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
+; RUN:   -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN:   -pass-remarks=memprof-context-disambiguation \
+; RUN:   %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1
+
+; TOPN1: created clone E.memprof.1
+; TOPN1: created clone DB.memprof.1
+; TOPN1: created clone CB.memprof.1
+; TOPN1: created clone A.memprof.1
+; TOPN1: call in clone main assigned to call function clone A.memprof.1
+; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1
+; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1
+; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1
+; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold
+; TOPN1: call in clone E marked with memprof allocation attribute notcold
+
+; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges
+; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added
+; TOPN1: 1 memprof-context-disambiguation - Number of important context ids
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @E() {
+entry:
+  %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14
+  ret void
+}
+
+define void @DB() {
+entry:
+  tail call void @E(), !callsite !17
+  ret void
+}
+
+define void @CB() {
+entry:
+  tail call void @DB(), !callsite !22
+  ret void
+}
+
+define void @A() {
+entry:
+  tail call void @CB(), !callsite !20
+  ret void
+}
+
+define i32 @main() {
+entry:
+  tail call void @A(), !callsite !25
+  tail call void @A(), !callsite !27
+  ret i32 0
+}
+
+declare ptr @_Znam(i64)
+
+!7 = !{!8, !10}
+!8 = !{!9, !"cold", !2}
+!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678}
+!2 = !{i64 12345, i64 200}
+!10 = !{!11, !"notcold", !3}
+!3 = !{i64 23456, i64 200}
+!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789}
+!14 = !{i64 123}
+!17 = !{i64 234, i64 345}
+!22 = !{i64 234, i64 456}
+!20 = !{i64 234,
i64 567} +!25 = !{i64 678} +!27 = !{i64 789} diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td index dc7a5b445cffe..706d9e0182f58 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -14,7 +14,7 @@ def G : Flag<["-"], "G">, HelpText<"print data-in-code table">; def h : Flag<["-"], "h">, HelpText<"print mach header">; def I : Flag<["-"], "I">, HelpText<"print indirect symbol table">; def j : Flag<["-"], "j">, HelpText<"print opcode bytes">; -def l : Flag<["-"], "l">, HelpText<"print load commnads">; +def l : Flag<["-"], "l">, HelpText<"print load commands">; def L : Flag<["-"], "L">, HelpText<"print used shared libraries">; def mcpu_EQ : Joined<["-"], "mcpu=">, HelpText<"select cpu for disassembly">; def o : Flag<["-"], "o">, HelpText<"print Objective-C segment">; diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 4d07462babefa..80d10138d7bfe 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -49,7 +49,6 @@ add_llvm_unittest(CodeGenTests TypeTraitsTest.cpp TargetOptionsTest.cpp TestAsmPrinter.cpp - MLRegAllocDevelopmentFeatures.cpp X86MCInstLowerTest.cpp ) diff --git a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp b/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp deleted file mode 100644 index 00c2c3abf8533..0000000000000 --- a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp +++ /dev/null @@ -1,293 +0,0 @@ -//===- MLRegAllocDevelopmentFeatures.cpp - test dev MLRegAlloc features ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "../../lib/CodeGen/MLRegAllocEvictAdvisor.h" -#include "llvm/Analysis/NoInferenceModelRunner.h" -#include "llvm/CodeGen/CodeGenTargetMachineImpl.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/TargetParser/Triple.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include -#include - -using namespace llvm; -using testing::ContainerEq; -using testing::Test; - -namespace { - -#include "MFCommon.inc" - -struct LRPosInfoIndexes { - size_t StartIndex; - size_t EndIndex; - size_t PhysReg; -}; - -class RegAllocDevelopmentFeaturesTest : public ::Test { -protected: - SmallVector - setupOverlapProblem(const SmallVectorImpl &Segments, - simple_ilist &IndexList) { - SmallVector PositionsToReturn; - PositionsToReturn.reserve(Segments.size()); - for (auto CurrentPosIndexInfo : Segments) { - LRStartEndInfo CurrentPosInfo = {}; - CurrentPosInfo.Pos = CurrentPosIndexInfo.PhysReg; - PositionsToReturn.push_back(CurrentPosInfo); - } - size_t CurrentSegmentIndex = 0; - size_t CurrentIndex = 0; - while (CurrentSegmentIndex < Segments.size()) { - auto *CurrentLEMem = static_cast( - 
Allocator.Allocate(sizeof(IndexListEntry), alignof(IndexListEntry))); - auto *CurrentListEntry = - new (CurrentLEMem) IndexListEntry(nullptr, CurrentIndex); - IndexList.push_back(*CurrentListEntry); - for (size_t CurrentPosInfoIndex = 0; - CurrentPosInfoIndex < Segments.size(); ++CurrentPosInfoIndex) { - if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].StartIndex) { - PositionsToReturn[CurrentPosInfoIndex].Begin = - SlotIndex(CurrentListEntry, 0); - } else if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].EndIndex) { - PositionsToReturn[CurrentPosInfoIndex].End = - SlotIndex(CurrentListEntry, 0); - ++CurrentSegmentIndex; - } - } - CurrentIndex += SlotIndex::InstrDist; - } - return PositionsToReturn; - } - - NoInferenceModelRunner setupModelRunner() { - const std::vector Inputs{ - TensorSpec::createSpec("instructions", InstructionsShape), - TensorSpec::createSpec("instructions_mapping", - InstructionsMappingShape), - TensorSpec::createSpec("mbb_frequencies", MBBFrequencyShape), - TensorSpec::createSpec("mbb_mapping", InstructionsShape)}; - LLVMContext Ctx; - return NoInferenceModelRunner(Ctx, Inputs); - } - - std::vector - getExpectedMappingMatrix(SmallVectorImpl &OverlapSetup) { - std::vector ExpectedMappingMatrix( - NumberOfInterferences * ModelMaxSupportedInstructionCount, 0); - for (auto NewSegment : OverlapSetup) { - for (size_t CurrentIndex = NewSegment.StartIndex; - CurrentIndex <= NewSegment.EndIndex; ++CurrentIndex) { - ExpectedMappingMatrix[NewSegment.PhysReg * - ModelMaxSupportedInstructionCount + - CurrentIndex] = 1; - } - } - return ExpectedMappingMatrix; - } - - void runOverlapTest(SmallVectorImpl &OverlapSetup) { - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - size_t MaxIndex = 0; - for (size_t CurrentOverlap = 0; CurrentOverlap < OverlapSetup.size(); - ++CurrentOverlap) { - if (OverlapSetup[CurrentOverlap].EndIndex > - OverlapSetup[MaxIndex].EndIndex) { - MaxIndex = CurrentOverlap; - } - } - SlotIndex LastIndex = OverlapProblem[MaxIndex].End; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, - 1, 2, 3, LastIndex); - std::vector MappingMatrix( - ModelRunner.getTensor(1), - ModelRunner.getTensor(1) + - NumberOfInterferences * ModelMaxSupportedInstructionCount); - ASSERT_THAT(MappingMatrix, - ContainerEq(getExpectedMappingMatrix(OverlapSetup))); - IndexList.clear(); - } - - BumpPtrAllocator Allocator; -}; - -// meta tests to ensure that test setup works correctly - -TEST_F(RegAllocDevelopmentFeaturesTest, - MetaOverlapInstructionDistancesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 5, 0}); - OverlapSetup.push_back({5, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].End), - 5 * SlotIndex::InstrDist); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].Begin), 0); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MetaSlotIndicesAreValid) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_TRUE(OverlapProblem[0].Begin.isValid()); - 
ASSERT_TRUE(OverlapProblem[0].End.isValid()); -} - -// Testing of feature extraction for per-instruction features - -TEST_F(RegAllocDevelopmentFeaturesTest, InstructionOpcodesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [FirstIndex](SlotIndex InputSlot) -> int { - return FirstIndex.distance(InputSlot) / SlotIndex::InstrDist; - }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, 1, - 2, 3, LastIndex); - for (size_t CurrentInstructionIndex = 0; - CurrentInstructionIndex < ModelMaxSupportedInstructionCount; - ++CurrentInstructionIndex) { - ASSERT_EQ( - (size_t)ModelRunner.getTensor(0)[CurrentInstructionIndex], - CurrentInstructionIndex); - } -} - -TEST_F(RegAllocDevelopmentFeaturesTest, FullOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 20, 0}); - OverlapSetup.push_back({15, 30, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlapOpposite) { - SmallVector OverlapSetup; - OverlapSetup.push_back({15, 30, 1}); - OverlapSetup.push_back({0, 20, 0}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, InternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 20, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, TripleInternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 25, 1}); - OverlapSetup.push_back({15, 20, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, InternalMultiOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 45, 0}); - OverlapSetup.push_back({30, 40, 1}); - OverlapSetup.push_back({35, 60, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, SingleMBBTest) { - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex CurrentIndex; - // set index to 1 so we can ensure that the mapping actually get set - std::map VisitedMBBs = {{nullptr, 1}}; - extractMBBFrequency( - CurrentIndex, 0, VisitedMBBs, - [](SlotIndex InputSlot) -> float { return 1.0f; }, nullptr, &ModelRunner, - 2, 3); - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[1], 1.0f); - ASSERT_EQ(ModelRunner.getTensor(3)[0], 1); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MBBFullTruncated) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - - LLVMContext Ctx; - Module Mod("Module", Ctx); - auto MF = createMachineFunction(Ctx, Mod); - std::array - MBBsForTest; - for (size_t I = 0; I < 
ModelMaxSupportedInstructionCount; ++I) { - MBBsForTest[I] = MF->CreateMachineBasicBlock(); - } - - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [FirstIndex](SlotIndex InputSlot) -> float { - return static_cast(FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist); - }, - [FirstIndex, MBBsForTest](SlotIndex InputSlot) -> MachineBasicBlock * { - return MBBsForTest[FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist]; - }, - 0, 1, 2, 3, LastIndex); - for (size_t MBBIndex = 0; MBBIndex < ModelMaxSupportedMBBCount; ++MBBIndex) { - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[MBBIndex], - static_cast(MBBIndex)); - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(MBBIndex)); - } - // the rest of the mapping values should be zero (truncated to 100 MBBs) - for (size_t MBBIndex = ModelMaxSupportedMBBCount; - MBBIndex < ModelMaxSupportedInstructionCount; ++MBBIndex) { - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(0)); - } -} - -} // end namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ee7fa175ca918..0e76c64f09f59 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1009,7 +1009,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe WidenR(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc()); checkVPRecipeCastImpl(&WidenR); delete AI; @@ -1092,7 +1092,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { IntegerType *Int64 = IntegerType::get(C, 64); auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast); + VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); checkVPRecipeCastImpl(&Recipe); delete Cast; @@ -1263,7 +1263,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe Recipe(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe Recipe(*AI, Args, VPIRMetadata(), DebugLoc()); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1468,7 +1468,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) { Args.push_back(ExtVPV1); Args.push_back(ExtVPV2); VPWidenRecipe *WidenR = - new VPWidenRecipe(*AI, make_range(Args.begin(), Args.end())); + new VPWidenRecipe(*AI, Args, VPIRMetadata(), DebugLoc()); VPBB1->appendRecipe(WidenR); { diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index ff894853b9771..228969ab37f85 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -449,6 +449,29 @@ void CodeGenIntrinsic::setProperty(const Record *R) { int64_t Lower = R->getValueAsInt("Lower"); int64_t Upper = R->getValueAsInt("Upper"); addArgAttribute(ArgNo, Range, Lower, Upper); + } else if (R->isSubClassOf("ArgInfo")) { + unsigned ArgNo = R->getValueAsInt("ArgNo"); + if (ArgNo < 1) + PrintFatalError(R->getLoc(), + "ArgInfo requires ArgNo >= 1 (0 is return value)"); + const ListInit *Properties = R->getValueAsListInit("Properties"); + StringRef ArgName; + StringRef FuncName; + + for (const Init 
*PropInit : Properties->getElements()) {
+      if (const auto *PropDef = dyn_cast<DefInit>(PropInit)) {
+        const Record *PropRec = PropDef->getDef();
+
+        if (PropRec->isSubClassOf("ArgName"))
+          ArgName = PropRec->getValueAsString("Name");
+        else if (PropRec->isSubClassOf("ImmArgPrinter"))
+          FuncName = PropRec->getValueAsString("FuncName");
+        else
+          PrintFatalError(PropRec->getLoc(),
+                          "Unknown ArgProperty type: " + PropRec->getName());
+      }
+    }
+    addPrettyPrintFunction(ArgNo - 1, ArgName, FuncName);
   } else {
     llvm_unreachable("Unknown property!");
   }
@@ -476,3 +499,16 @@ void CodeGenIntrinsic::addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V,
     ArgumentAttributes.resize(Idx + 1);
   ArgumentAttributes[Idx].emplace_back(AK, V, V2);
 }
+
+void CodeGenIntrinsic::addPrettyPrintFunction(unsigned ArgIdx,
+                                              StringRef ArgName,
+                                              StringRef FuncName) {
+  auto It = llvm::find_if(PrettyPrintFunctions, [ArgIdx](const auto &Info) {
+    return Info.ArgIdx == ArgIdx;
+  });
+  if (It != PrettyPrintFunctions.end())
+    PrintFatalError(TheDef->getLoc(), "ArgInfo for argument " + Twine(ArgIdx) +
+                                          " is already defined as '" +
+                                          It->FuncName + "'");
+  PrettyPrintFunctions.emplace_back(ArgIdx, ArgName, FuncName);
+}
diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h
index 15e803c4feba1..6ac6f734326d8 100644
--- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h
+++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h
@@ -152,6 +152,22 @@ struct CodeGenIntrinsic {
   void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0,
                        uint64_t V2 = 0);
 
+  /// Structure to store pretty print and argument information.
+  struct PrettyPrintArgInfo {
+    unsigned ArgIdx;
+    StringRef ArgName;
+    StringRef FuncName;
+
+    PrettyPrintArgInfo(unsigned Idx, StringRef Name, StringRef Func)
+        : ArgIdx(Idx), ArgName(Name), FuncName(Func) {}
+  };
+
+  /// Vector that stores ArgInfo (ArgIndex, ArgName, FunctionName).
+  SmallVector<PrettyPrintArgInfo> PrettyPrintFunctions;
+
+  void addPrettyPrintFunction(unsigned ArgIdx, StringRef ArgName,
+                              StringRef FuncName);
+
   bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); }
 
   /// Goes through all IntrProperties that have IsDefault value set and sets
diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
index 452d2b08f25c3..3ac23185ef91c 100644
--- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
@@ -60,8 +60,16 @@ class IntrinsicEmitter {
                                 raw_ostream &OS);
   void EmitIntrinsicToOverloadTable(const CodeGenIntrinsicTable &Ints,
                                     raw_ostream &OS);
+  void EmitIntrinsicToPrettyPrintTable(const CodeGenIntrinsicTable &Ints,
+                                       raw_ostream &OS);
+  void EmitIntrinsicBitTable(
+      const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard,
+      StringRef TableName, StringRef Comment,
+      function_ref<bool(const CodeGenIntrinsic &)> GetProperty);
   void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
   void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS);
+  void EmitPrettyPrintArguments(const CodeGenIntrinsicTable &Ints,
+                                raw_ostream &OS);
   void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints,
                                  bool IsClang, raw_ostream &OS);
 };
@@ -109,6 +117,12 @@ void IntrinsicEmitter::run(raw_ostream &OS, bool Enums) {
   // Emit the intrinsic parameter attributes.
   EmitAttributes(Ints, OS);
 
+  // Emit the intrinsic ID -> pretty print table.
+  EmitIntrinsicToPrettyPrintTable(Ints, OS);
+
+  // Emit the pretty-print routines for annotated immediate arguments.
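+  // The emitted code is guarded by GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS and
+  // defines Intrinsic::printImmArg. A printer is expected to consult the
+  // PPTable bitset first, roughly (a sketch; the caller lives elsewhere):
+  //   if (PPTable bit for IID is set)
+  //     Intrinsic::printImmArg(IID, ArgIdx, OS, ImmArgVal);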
+  EmitPrettyPrintArguments(Ints, OS);
+
   // Emit code to translate Clang builtins into LLVM intrinsics.
   EmitIntrinsicToBuiltinMap(Ints, true, OS);
@@ -240,6 +254,29 @@ static constexpr IntrinsicTargetInfo TargetInfos[] = {
 )";
 }
 
+/// Helper function to emit a bit table for intrinsic properties.
+/// This is used for both overload and pretty print bit tables.
+void IntrinsicEmitter::EmitIntrinsicBitTable(
+    const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard,
+    StringRef TableName, StringRef Comment,
+    function_ref<bool(const CodeGenIntrinsic &)> GetProperty) {
+  OS << formatv("// {}\n", Comment);
+  OS << formatv("#ifdef {}\n", Guard);
+  OS << formatv("static constexpr uint8_t {}[] = {{\n", TableName);
+  OS << "  0\n  ";
+  for (auto [I, Int] : enumerate(Ints)) {
+    // Add one to the index so we emit a null bit for the invalid #0 intrinsic.
+    size_t Idx = I + 1;
+    if (Idx % 8 == 0)
+      OS << ",\n  0";
+    if (GetProperty(Int))
+      OS << " | (1<<" << Idx % 8 << ')';
+  }
+  OS << "\n};\n\n";
+  OS << formatv("return ({}[id/8] & (1 << (id%8))) != 0;\n", TableName);
+  OS << formatv("#endif // {}\n\n", Guard);
+}
+
 void IntrinsicEmitter::EmitIntrinsicToNameTable(
     const CodeGenIntrinsicTable &Ints, raw_ostream &OS) {
   // Built up a table of the intrinsic names.
@@ -276,24 +313,10 @@ static constexpr unsigned IntrinsicNameOffsetTable[] = {
 
 void IntrinsicEmitter::EmitIntrinsicToOverloadTable(
     const CodeGenIntrinsicTable &Ints, raw_ostream &OS) {
-  OS << R"(// Intrinsic ID to overload bitset.
-#ifdef GET_INTRINSIC_OVERLOAD_TABLE
-static constexpr uint8_t OTable[] = {
-  0
-  )";
-  for (auto [I, Int] : enumerate(Ints)) {
-    // Add one to the index so we emit a null bit for the invalid #0 intrinsic.
-    size_t Idx = I + 1;
-
-    if (Idx % 8 == 0)
-      OS << ",\n  0";
-    if (Int.isOverloaded)
-      OS << " | (1<<" << Idx % 8 << ')';
-  }
-  OS << "\n};\n\n";
-  // OTable contains a true bit at the position if the intrinsic is overloaded.
-  OS << "return (OTable[id/8] & (1 << (id%8))) != 0;\n";
-  OS << "#endif\n\n";
+  EmitIntrinsicBitTable(
+      Ints, OS, "GET_INTRINSIC_OVERLOAD_TABLE", "OTable",
+      "Intrinsic ID to overload bitset.",
+      [](const CodeGenIntrinsic &Int) { return Int.isOverloaded; });
 }
 
 using TypeSigTy = SmallVector<unsigned char>;
@@ -809,6 +832,52 @@ AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) {{
                          NoFunctionAttrsID);
 }
 
+void IntrinsicEmitter::EmitIntrinsicToPrettyPrintTable(
+    const CodeGenIntrinsicTable &Ints, raw_ostream &OS) {
+  EmitIntrinsicBitTable(Ints, OS, "GET_INTRINSIC_PRETTY_PRINT_TABLE", "PPTable",
+                        "Intrinsic ID to pretty print bitset.",
+                        [](const CodeGenIntrinsic &Int) {
+                          return !Int.PrettyPrintFunctions.empty();
+                        });
+}
+
+void IntrinsicEmitter::EmitPrettyPrintArguments(
+    const CodeGenIntrinsicTable &Ints, raw_ostream &OS) {
+  OS << R"(
+#ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS
+void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) {
+  using namespace Intrinsic;
+  switch (IID) {
+)";
+
+  for (const CodeGenIntrinsic &Int : Ints) {
+    if (Int.PrettyPrintFunctions.empty())
+      continue;
+
+    OS << "  case " << Int.EnumName << ":\n";
+    OS << "    switch (ArgIdx) {\n";
+    for (const auto [ArgIdx, ArgName, FuncName] : Int.PrettyPrintFunctions) {
+      OS << "    case " << ArgIdx << ":\n";
+      OS << "      OS << \"" << ArgName << "=\";\n";
+      if (!FuncName.empty()) {
+        OS << "      ";
+        if (!Int.TargetPrefix.empty())
+          OS << Int.TargetPrefix << "::";
+        OS << FuncName << "(OS, ImmArgVal);\n";
+      }
+      OS << "      return;\n";
+    }
+    OS << "    }\n";
+    OS << "    break;\n";
+  }
+  OS << R"(  default:
+    break;
+  }
+}
+#endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS
+)";
+}
+
 void IntrinsicEmitter::EmitIntrinsicToBuiltinMap(
     const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS) {
   StringRef CompilerName = IsClang ? "Clang" : "MS";
diff --git a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h
index 7ffc861331760..7020e24517d09 100644
--- a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h
+++ b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h
@@ -65,11 +65,8 @@ class AttrConvertFastMathToLLVM {
           convertArithFastMathAttrToLLVM(arithFMFAttr));
     }
   }
 
   ArrayRef<NamedAttribute> getAttrs() const { return convertedAttr.getAttrs(); }
-
-  LLVM::IntegerOverflowFlags getOverflowFlags() const {
-    return LLVM::IntegerOverflowFlags::none;
-  }
+  Attribute getPropAttr() const { return {}; }
 
 private:
   NamedAttrList convertedAttr;
@@ -82,23 +79,36 @@ template <typename SourceOp, typename TargetOp>
 class AttrConvertOverflowToLLVM {
 public:
   AttrConvertOverflowToLLVM(SourceOp srcOp) {
+    using IntegerOverflowFlagsAttr = LLVM::IntegerOverflowFlagsAttr;
+
     // Copy the source attributes.
     convertedAttr = NamedAttrList{srcOp->getAttrs()};
     // Get the name of the arith overflow attribute.
     StringRef arithAttrName = SourceOp::getIntegerOverflowAttrName();
-    // Remove the source overflow attribute.
+    // Remove the source overflow attribute from the set that will be present
+    // in the target.
    if (auto arithAttr = dyn_cast_if_present<arith::IntegerOverflowFlagsAttr>(
            convertedAttr.erase(arithAttrName))) {
-      overflowFlags = convertArithOverflowFlagsToLLVM(arithAttr.getValue());
+      auto llvmFlag = convertArithOverflowFlagsToLLVM(arithAttr.getValue());
+      // Create a dictionary attribute holding the overflow flags property.
+      // (In the LLVM dialect, the overflow flags are a property, not an
+      // attribute.)
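+      // The dictionary built here is returned from getPropAttr() and
+      // installed through OperationState::propertiesAttr in oneToOneRewrite,
+      // so the flags become a native property of the created LLVM operation.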
+      MLIRContext *ctx = srcOp.getOperation()->getContext();
+      Builder b(ctx);
+      auto llvmFlagAttr = IntegerOverflowFlagsAttr::get(ctx, llvmFlag);
+      StringRef llvmAttrName = TargetOp::getOverflowFlagsAttrName();
+      NamedAttribute attr{llvmAttrName, llvmFlagAttr};
+      // Set the properties attribute of the operation state so that the
+      // property can be updated when the operation is created.
+      propertiesAttr = b.getDictionaryAttr(ArrayRef(attr));
     }
   }
 
   ArrayRef<NamedAttribute> getAttrs() const { return convertedAttr.getAttrs(); }
-  LLVM::IntegerOverflowFlags getOverflowFlags() const { return overflowFlags; }
+  Attribute getPropAttr() const { return propertiesAttr; }
 
 private:
   NamedAttrList convertedAttr;
-  LLVM::IntegerOverflowFlags overflowFlags = LLVM::IntegerOverflowFlags::none;
+  DictionaryAttr propertiesAttr;
 };
 
 template <typename SourceOp, typename TargetOp>
@@ -129,9 +139,7 @@ class AttrConverterConstrainedFPToLLVM {
   }
 
   ArrayRef<NamedAttribute> getAttrs() const { return convertedAttr.getAttrs(); }
-  LLVM::IntegerOverflowFlags getOverflowFlags() const {
-    return LLVM::IntegerOverflowFlags::none;
-  }
+  Attribute getPropAttr() const { return {}; }
 
 private:
   NamedAttrList convertedAttr;
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
index c292e3727f46c..f8e0ccc093f8b 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
@@ -19,16 +19,14 @@ class CallOpInterface;
 namespace LLVM {
 namespace detail {
 
-/// Handle generically setting flags as native properties on LLVM operations.
-void setNativeProperties(Operation *op, IntegerOverflowFlags overflowFlags);
-
 /// Replaces the given operation "op" with a new operation of type "targetOp"
 /// and given operands.
-LogicalResult oneToOneRewrite(
-    Operation *op, StringRef targetOp, ValueRange operands,
-    ArrayRef<NamedAttribute> targetAttrs,
-    const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter,
-    IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none);
+LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp,
+                              ValueRange operands,
+                              ArrayRef<NamedAttribute> targetAttrs,
+                              Attribute propertiesAttr,
+                              const LLVMTypeConverter &typeConverter,
+                              ConversionPatternRewriter &rewriter);
 
 /// Replaces the given operation "op" with a call to an LLVM intrinsic with the
 /// specified name "intrinsic" and operands.
@@ -307,9 +305,9 @@ class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> {
   LogicalResult
   matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(),
-                                         adaptor.getOperands(), op->getAttrs(),
-                                         *this->getTypeConverter(), rewriter);
+    return LLVM::detail::oneToOneRewrite(
+        op, TargetOp::getOperationName(), adaptor.getOperands(), op->getAttrs(),
+        /*propertiesAttr=*/Attribute{}, *this->getTypeConverter(), rewriter);
   }
 };
 
diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
index e7ab63abfeaa1..47b8381eefda8 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h
@@ -54,25 +54,26 @@ LogicalResult handleMultidimensionalVectors(
     std::function<Value(Type, ValueRange)> createOperand,
     ConversionPatternRewriter &rewriter);
 
-LogicalResult vectorOneToOneRewrite(
-    Operation *op, StringRef targetOp, ValueRange operands,
-    ArrayRef<NamedAttribute> targetAttrs,
-    const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter,
-    IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none);
+LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp,
+                                    ValueRange operands,
+                                    ArrayRef<NamedAttribute> targetAttrs,
+                                    Attribute propertiesAttr,
+                                    const LLVMTypeConverter &typeConverter,
+                                    ConversionPatternRewriter &rewriter);
 } // namespace detail
 } // namespace LLVM
 
 // Default attribute conversion class, which passes all source attributes
-// through to the target op, unmodified.
+// through to the target op, unmodified. The attribute used to set the
+// properties of the target operation will be null (i.e. any properties the
+// target operation has keep their default values).
 template <typename SourceOp, typename TargetOp>
 class AttrConvertPassThrough {
 public:
   AttrConvertPassThrough(SourceOp srcOp) : srcAttrs(srcOp->getAttrs()) {}
 
   ArrayRef<NamedAttribute> getAttrs() const { return srcAttrs; }
-  LLVM::IntegerOverflowFlags getOverflowFlags() const {
-    return LLVM::IntegerOverflowFlags::none;
-  }
+  Attribute getPropAttr() const { return {}; }
 
 private:
   ArrayRef<NamedAttribute> srcAttrs;
@@ -80,10 +81,13 @@ class AttrConvertPassThrough {
 
 /// Basic lowering implementation to rewrite Ops with just one result to the
 /// LLVM Dialect. This supports higher-dimensional vector types.
-/// The AttrConvert template template parameter should be a template class
-/// with SourceOp and TargetOp type parameters, a constructor that takes
-/// a SourceOp instance, and a getAttrs() method that returns
-/// ArrayRef<NamedAttribute>.
+/// The AttrConvert template template parameter should:
+///   - be a template class with SourceOp and TargetOp type parameters,
+///   - have a constructor that takes a SourceOp instance,
+///   - have a getAttrs() method that returns the ArrayRef<NamedAttribute> of
+///     attributes the target operation will carry, and
+///   - have a getPropAttr() method that returns either a null Attribute or a
+///     DictionaryAttr holding properties that exist on the target operation.
 template <typename SourceOp, typename TargetOp,
           template <typename, typename> typename AttrConvert =
               AttrConvertPassThrough,
@@ -137,8 +141,8 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> {
 
     return LLVM::detail::vectorOneToOneRewrite(
         op, TargetOp::getOperationName(), adaptor.getOperands(),
-        attrConvert.getAttrs(), *this->getTypeConverter(), rewriter,
-        attrConvert.getOverflowFlags());
+        attrConvert.getAttrs(), attrConvert.getPropAttr(),
+        *this->getTypeConverter(), rewriter);
   }
 };
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index a38cf41a3e09b..77d780425c3c3 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -158,6 +158,18 @@ class Arith_IntBinaryOpWithOverflowFlags<string mnemonic, list<Trait> traits = [
     attr-dict `:` type($result) }];
 }
 
+class Arith_IntBinaryOpWithExactFlag<string mnemonic, list<Trait> traits = []> :
+    Arith_BinaryOp]>,
+    Arguments<(ins SignlessIntegerOrIndexLike:$lhs,
+                   SignlessIntegerOrIndexLike:$rhs,
+                   UnitAttr:$isExact)>,
+    Results<(outs SignlessIntegerOrIndexLike:$result)> {
+
+  let assemblyFormat = [{ $lhs `,` $rhs (`exact` $isExact^)?
+    attr-dict `:` type($result) }];
+}
+
 //===----------------------------------------------------------------------===//
 // ConstantOp
 //===----------------------------------------------------------------------===//
@@ -482,7 +494,8 @@ def Arith_MulUIExtendedOp : Arith_Op<"mului_extended", [Pure, Commutative,
 // DivUIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
+def Arith_DivUIOp : Arith_IntBinaryOpWithExactFlag<"divui",
+    [ConditionallySpeculatable]> {
   let summary = "unsigned integer division operation";
   let description = [{
     Unsigned integer division. Rounds towards zero. Treats the leading bit as
@@ -493,12 +506,18 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
     `tensor` values, the behavior is undefined if _any_ elements are divided by
     zero.
 
+    If the `exact` attribute is present, the result value is poison if `lhs` is
+    not a multiple of `rhs`.
+
     Example:
 
     ```mlir
     // Scalar unsigned integer division.
     %a = arith.divui %b, %c : i64
 
+    // Scalar unsigned integer division where %b is known to be a multiple of %c.
+    %a = arith.divui %b, %c exact : i64
+
     // SIMD vector element-wise division.
     %f = arith.divui %g, %h : vector<4xi32>
 
@@ -519,7 +538,8 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> {
 // DivSIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_DivSIOp : Arith_IntBinaryOp<"divsi", [ConditionallySpeculatable]> {
+def Arith_DivSIOp : Arith_IntBinaryOpWithExactFlag<"divsi",
+    [ConditionallySpeculatable]> {
   let summary = "signed integer division operation";
   let description = [{
    Signed integer division. Rounds towards zero. Treats the leading bit as
@@ -530,12 +550,18 @@ def Arith_DivSIOp : Arith_IntBinaryOp<"divsi", [ConditionallySpeculatable]> {
     behavior is undefined if _any_ of its elements are divided by zero or has a
     signed division overflow.
 
+    If the `exact` attribute is present, the result value is poison if `lhs` is
+    not a multiple of `rhs`.
+
     Example:
 
     ```mlir
     // Scalar signed integer division.
     %a = arith.divsi %b, %c : i64
 
+    // Scalar signed integer division where %b is known to be a multiple of %c.
+    %a = arith.divsi %b, %c exact : i64
+
     // SIMD vector element-wise division.
     %f = arith.divsi %g, %h : vector<4xi32>
 
@@ -821,7 +847,7 @@ def Arith_ShLIOp : Arith_IntBinaryOpWithOverflowFlags<"shli"> {
 // ShRUIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
+def Arith_ShRUIOp : Arith_IntBinaryOpWithExactFlag<"shrui", [Pure]> {
   let summary = "unsigned integer right-shift";
   let description = [{
     The `shrui` operation shifts an integer value of the first operand to the right
@@ -830,12 +856,17 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
     filled with zeros. If the value of the second operand is greater or equal
     than the bitwidth of the first operand, then the operation returns poison.
 
+    If the `exact` attribute is present, the result value of shrui is a poison
+    value if any of the bits shifted out are non-zero.
+
     Example:
 
     ```mlir
-    %1 = arith.constant 160 : i8               // %1 is 0b10100000
+    %1 = arith.constant 160 : i8                 // %1 is 0b10100000
     %2 = arith.constant 3 : i8
-    %3 = arith.shrui %1, %2 : (i8, i8) -> i8   // %3 is 0b00010100
+    %3 = arith.constant 6 : i8
+    %4 = arith.shrui %1, %2 exact : i8           // %4 is 0b00010100
+    %5 = arith.shrui %1, %3 : i8                 // %5 is 0b00000010
     ```
   }];
   let hasFolder = 1;
@@ -845,7 +876,7 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> {
 // ShRSIOp
 //===----------------------------------------------------------------------===//
 
-def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> {
+def Arith_ShRSIOp : Arith_IntBinaryOpWithExactFlag<"shrsi", [Pure]> {
   let summary = "signed integer right-shift";
   let description = [{
     The `shrsi` operation shifts an integer value of the first operand to the right
@@ -856,14 +887,17 @@ def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> {
     operand is greater or equal than bitwidth of the first operand, then the
     operation returns poison.
 
+    If the `exact` attribute is present, the result value of shrsi is a poison
+    value if any of the bits shifted out are non-zero.
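+    Equivalently, `exact` asserts that the first operand is a multiple of
+    2^(shift amount), so the shift discards no set bits.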
+
     Example:
 
     ```mlir
-    %1 = arith.constant 160 : i8              // %1 is 0b10100000
+    %1 = arith.constant 160 : i8                 // %1 is 0b10100000
     %2 = arith.constant 3 : i8
-    %3 = arith.shrsi %1, %2 : (i8, i8) -> i8  // %3 is 0b11110100
-    %4 = arith.constant 96 : i8               // %4 is 0b01100000
-    %5 = arith.shrsi %4, %2 : (i8, i8) -> i8  // %5 is 0b00001100
+    %3 = arith.shrsi %1, %2 exact : i8           // %3 is 0b11110100
+    %4 = arith.constant 98 : i8                  // %4 is 0b01100010
+    %5 = arith.shrsi %4, %2 : i8                 // %5 is 0b00001100
     ```
   }];
   let hasFolder = 1;
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index e425e16a4b1a6..971710fa3ee13 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -39,7 +39,7 @@ class LLVM_TerminatorOp<string mnemonic, list<Trait> traits = []> :
 class LLVM_ArithmeticOpBase<Type type, string mnemonic, string instName,
                             list<Trait> traits = []> :
     LLVM_Op<mnemonic,
-        !listconcat([SameOperandsAndResultType, Pure], traits)>,
+        !listconcat([SameOperandsAndResultType, NoMemoryEffect], traits)>,
     LLVM_Builder<"$res = builder.Create" # instName # "($lhs, $rhs);"> {
   dag commonArgs = (ins LLVM_ScalarOrVectorOf<type>:$lhs,
                         LLVM_ScalarOrVectorOf<type>:$rhs);
@@ -116,7 +117,8 @@ class LLVM_IntArithmeticOpWithDisjointFlag<string mnemonic, string instName,
                             list<Trait> traits = []> :
     LLVM_ArithmeticOpBase<LLVM_AnyFloat, mnemonic, instName,
-        !listconcat([DeclareOpInterfaceMethods<FastmathFlagsInterface>], traits)> {
+        !listconcat([DeclareOpInterfaceMethods<FastmathFlagsInterface>, Pure],
+                    traits)> {
   dag fmfArg = (
       ins DefaultValuedAttr<LLVM_FastmathFlagsAttr, "{}">:$fastmathFlags);
   let arguments = !con(commonArgs, fmfArg);
@@ -149,24 +150,26 @@ class LLVM_UnaryFloatArithmeticOp<Type type, string mnemonic, string instName> :
 def LLVM_AddOp : LLVM_IntArithmeticOpWithOverflowFlag<"add", "Add",
-                                                      [Commutative]>;
+                                                      [Commutative, Pure]>;
-def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", []>;
+def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", [Pure]>;
 def LLVM_MulOp : LLVM_IntArithmeticOpWithOverflowFlag<"mul", "Mul",
-                                                      [Commutative]>;
-def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv">;
-def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv">;
-def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem">;
-def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem">;
-def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And">;
-def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or"> {
+                                                      [Commutative, Pure]>;
+def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv",
+    [DeclareOpInterfaceMethods<ConditionallySpeculatable>]>;
+def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv",
+    [DeclareOpInterfaceMethods<ConditionallySpeculatable>]>;
+def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem", [Pure]>;
+def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem", [Pure]>;
+def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And", [Pure]>;
+def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or", [Pure]> {
   let hasFolder = 1;
 }
-def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor">;
-def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", []> {
+def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor", [Pure]>;
+def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", [Pure]> {
   let hasFolder = 1;
 }
-def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr">;
-def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr">;
+def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr", [Pure]>;
+def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr", [Pure]>;
 
 // Base class for compare operations. A compare operation takes two operands
 // of the same type and returns a boolean result. If the operands are
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index d4ef5104d3c1f..6e3a92b5bde42 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -79,6 +79,40 @@ def NVVM_Dialect : Dialect {
     sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op
     to embed PTX inline as a last-resort escape hatch, with explicit operands
     and results.
+
+    **Memory Spaces:** The NVVM dialect introduces the following memory spaces,
+    each with distinct scopes and lifetimes:
+```
+    | Memory Space      | Address Space | Scope                | Lifetime          |
+    |-------------------|---------------|----------------------|-------------------|
+    | `generic`         | 0             | All threads          | Context-dependent |
+    | `global`          | 1             | All threads (device) | Application       |
+    | `shared`          | 3             | Thread block (CTA)   | Kernel execution  |
+    | `constant`        | 4             | All threads (RO)     | Application       |
+    | `local`           | 5             | Single thread        | Kernel execution  |
+    | `tensor`          | 6             | Thread block (CTA)   | Kernel execution  |
+    | `shared_cluster`  | 7             | Thread block cluster | Kernel execution  |
+```
+    **Memory Space Details:**
+    - **generic**: Can point to any memory space; requires runtime resolution of
+      the actual address space. Use when the pointer origin is unknown at compile
+      time. Performance varies based on the underlying memory space.
+    - **global**: Accessible by all threads across all blocks; persists across
+      kernel launches. Highest latency but largest capacity (device memory). Best
+      for large data and inter-kernel communication.
+    - **shared**: Shared within a thread block (CTA); very fast on-chip memory for
+      cooperation between threads in the same block. Limited capacity. Ideal for
+      block-level collaboration, caching, and reducing global memory traffic.
+    - **constant**: Read-only memory cached per SM. Size typically limited to
+      64KB. Best for read-only data and uniform values accessed by all threads.
+    - **local**: Private to each thread. Use for per-thread private data and
+      automatic variables that don't fit in registers.
+    - **tensor**: Special memory space for tensor core operations. Used by
+      `tcgen05` instructions on SM 100+ for tensor input/output operations.
+    - **shared_cluster**: Distributed shared memory across thread blocks within
+      a cluster (SM 90+). Enables collaboration beyond single-block scope with
+      fast access across cluster threads.
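+
+    For illustration (a minimal sketch; it assumes the LLVM dialect's
+    `!llvm.ptr<N>` spelling, where `N` is the address space from the table
+    above), a kernel taking a global-memory input and a shared-memory scratch
+    pointer would be typed as:
+```mlir
+    // %in lives in global (1) memory, %scratch in shared (3) memory.
+    llvm.func @kernel(%in: !llvm.ptr<1>, %scratch: !llvm.ptr<3>) {
+      llvm.return
+    }
+```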
  }];
 
  let name = "nvvm";
@@ -228,6 +262,33 @@ def NVVMMemorySpaceAttr :
   let assemblyFormat = "`<` $value `>`";
 }
 
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+  [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+// Attrs to disambiguate the cta or cluster space within shared memory
+def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">;
+def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">;
+def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space",
+  [SharedSpaceCTA, SharedSpaceCluster]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def SharedSpaceAttr : EnumAttr<NVVM_Dialect, SharedSpace, "shared_space"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
 //===----------------------------------------------------------------------===//
 // NVVM intrinsic operations
 //===----------------------------------------------------------------------===//
@@ -1107,17 +1168,6 @@ def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> {
   let assemblyFormat = "attr-dict";
 }
 
-def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">;
-def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">;
-def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space",
-  [SharedSpaceCTA, SharedSpaceCluster]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::NVVM";
-}
-def SharedSpaceAttr : EnumAttr<NVVM_Dialect, SharedSpace, "shared_space"> {
-  let assemblyFormat = "`<` $value `>`";
-}
-
 def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
 def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
 def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
@@ -1158,21 +1208,6 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
 
   let hasVerifier = 1;
 }
 
-// Attrs describing the scope of the Memory Operation
-def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
-def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
-def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
-def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
-
-def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
-  [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::NVVM";
-}
-def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
-  let assemblyFormat = "`<` $value `>`";
-}
-
 def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
   Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
              DefaultValuedAttr
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
   ];
 }
 
+def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
+  let summary = "Generate implicit acc routine for functions in acc regions";
+  let description = [{
+    This pass implements the implicit rules described in the OpenACC
+    specification for the `Routine Directive` (OpenACC 3.4 spec, section 2.15.1).
+
+    "If no explicit routine directive applies to a procedure whose definition
+    appears in the program unit being compiled, then the implementation applies
+    an implicit routine directive to that procedure if any of the following
+    conditions holds:
+    - The procedure is called or its address is accessed in a compute region."
+
+    The specification further states:
+    "When the implementation applies an implicit routine directive to a procedure,
+    it must recursively apply implicit routine directives to other procedures for
+    which the above rules specify relevant dependencies. Such dependencies can
+    form a cycle, so the implementation must take care to avoid infinite recursion."
+
+    This pass implements these requirements by:
+    1. Walking through all OpenACC compute constructs and functions already
+       marked with `acc routine` in the module and identifying function calls
+       within these regions.
+    2. Creating implicit `acc.routine` operations for functions that don't already
+       have routine declarations.
+    3. Recursively walking through all existing `acc routine` operations and
+       creating implicit routine operations for function calls within these
+       routines, while avoiding infinite recursion through proper tracking.
+  }];
+  let dependentDialects = ["mlir::acc::OpenACCDialect"];
+  let options = [
+    Option<"deviceType", "device-type", "mlir::acc::DeviceType",
+           "mlir::acc::DeviceType::None",
+           "Target device type for implicit routine generation. "
+           "Ensures that `acc routine` device_type clauses are "
+           "properly considered, not just default clauses.">
+  ];
+}
+
 #endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index af64370a62dd7..419ecda80e9a5 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -58,9 +58,10 @@ enum class SparseEmitStrategy {
 namespace sparse_tensor {
 
 /// Defines a strategy for loop ordering during sparse code generation.
+/// See Passes.td for strategy descriptions.
 enum class LoopOrderingStrategy : unsigned {
-  kDefault, ///< Default strategy (eagerly selects last loop in topological
-            ///< sort).
+  kDefault,
+  kDenseOuter,
 };
 
 } // namespace sparse_tensor
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
index 75e77d67db1b3..0b8562e484f51 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -85,7 +85,9 @@ def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> {
            "mlir::sparse_tensor::LoopOrderingStrategy::kDefault",
            "Set the loop ordering strategy for sparse code generation", [{llvm::cl::values(
              clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDefault, "default",
-                        "Default strategy (eagerly selects last loop in topological sort)"))}]>,
+                        "Default strategy (eagerly selects last loop in topological sort)"),
+             clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDenseOuter, "dense-outer",
+                        "Prefer dense, then compressed, then singleton dimensions outermost"))}]>,
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 689ebd0d1179a..4c67856b559b1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -844,7 +844,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
                        OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
-                       OptionalAttr<XeGPU_LayoutAttr>:$layout);
+                       OptionalAttr<DistributeLayoutAttr>:$layout);
   let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
@@ -903,7 +903,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> {
                    "xegpu::CachePolicyAttr": $l1_hint,
                    "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint,
-                   "xegpu::LayoutAttr": $layout)>
+                   "xegpu::DistributeLayoutAttr": $layout)>
   ];
 
   let hasVerifier = 1;
@@ -988,7 +988,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
                        OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
-                       OptionalAttr<XeGPU_LayoutAttr>:$layout);
+                       OptionalAttr<DistributeLayoutAttr>:$layout);
 
   let extraClassDeclaration = extraBaseClassDeclaration#[{
     Type getDestType() {
@@ -1046,7 +1046,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> {
                    "xegpu::CachePolicyAttr": $l1_hint,
                    "xegpu::CachePolicyAttr": $l2_hint,
                    "xegpu::CachePolicyAttr": $l3_hint,
-                   "xegpu::LayoutAttr": $layout)>
+                   "xegpu::DistributeLayoutAttr": $layout)>
   ];
 
   let hasVerifier = 1;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a5831559558ac..edc6565f44f00 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -43,6 +43,7 @@ constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
 constexpr Chipset kGfx950 = Chipset(9, 5, 0);
+constexpr Chipset kGfx1250 = Chipset(12, 5, 0);
 
 /// Convert an unsigned number `val` to i32.
 static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
@@ -1149,7 +1150,7 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
                                     k, isRDNA3);
 
   // Handle gfx1250.
-  if (chipset == Chipset{12, 5, 0})
+  if (chipset == kGfx1250)
     return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType,
                                     elemDestType, k);
 
@@ -1300,7 +1301,7 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
     if (chipset.majorVersion != 11 && chipset.majorVersion != 12)
       return op->emitOpError("WMMA only supported on gfx11 and gfx12");
 
-    bool isGFX1250 = chipset >= Chipset(12, 5, 0);
+    bool isGFX1250 = chipset >= kGfx1250;
 
     // The WMMA operations represent vectors of bf16s as vectors of i16s
     // (except on gfx1250), so we need to bitcast bfloats to i16 and then
@@ -1505,6 +1506,19 @@ struct ExtPackedFp8OpLowering final
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+struct ScaledExtPacked816OpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPacked816Op> {
+  ScaledExtPacked816OpLowering(const LLVMTypeConverter &converter,
+                               Chipset chipset)
+      : ConvertOpToLLVMPattern<ScaledExtPacked816Op>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 struct PackedTrunc2xFp8OpLowering final
     : public ConvertOpToLLVMPattern<PackedTrunc2xFp8Op> {
   PackedTrunc2xFp8OpLowering(const LLVMTypeConverter &converter,
@@ -1613,6 +1627,170 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
+int32_t getScaleSel(int32_t blockSize, unsigned bitWidth,
+                    int32_t firstScaleLane, int32_t firstScaleByte) {
+  // When lowering amdgpu.scaled_ext_packed816 to rocdl.cvt.scale.pk*.f*.f*
+  // operations, the attributes blockSize, sourceType, firstScaleLane and
+  // firstScaleByte are merged into a single attribute scaleSel. This is how
+  // those values are merged together.
+  assert(llvm::is_contained({16, 32}, blockSize));
+  assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth));
+
+  const bool is_fp8 = bitWidth == 8;
+  const bool is_block_16 = blockSize == 16;
+
+  if (!is_fp8) {
+    int bit_0 = is_block_16;
+    assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
+    int bit_1 = (firstScaleByte == 2) << 1;
+    assert(llvm::is_contained({0, 1}, firstScaleLane));
+    int bit_2 = firstScaleLane << 2;
+    return bit_2 | bit_1 | bit_0;
+  }
+
+  int bit_0 = is_block_16;
+  // firstScaleByte is guaranteed to be defined by two bits.
+  assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
+  int bit_2_and_1 = firstScaleByte << 1;
+  assert(llvm::is_contained({0, 1}, firstScaleLane));
+  int bit_3 = firstScaleLane << 3;
+  int bits = bit_3 | bit_2_and_1 | bit_0;
+  // These are invalid cases.
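+  // For example, 0b0011 decodes to blockSize 16 with firstScaleByte 1, a
+  // combination the op verifier rejects for fp8 (only (firstScaleLane,
+  // firstScaleByte) of (0, 0) or (1, 2) is accepted for 16-element fp8 blocks).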
+  assert(!llvm::is_contained(
+      {0b0011, 0b0101, 0b0111, 0b1000, 0b1001, 0b1011, 0b1111}, bits));
+  return bits;
+}
+
+static std::optional<StringRef>
+scaledExtPacked816ToIntrinsic(Type srcElemType, Type destElemType) {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  if (isa<fp4>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp4Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp4Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp4Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Bf8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Bf8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Bf8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Fp6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Fp6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Fp6Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Bf6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Bf6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Bf6Op::getOperationName();
+    return std::nullopt;
+  }
+  llvm_unreachable("invalid combination of element types for packed conversion "
+                   "instructions");
+}
+
+LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
+    ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  Location loc = op.getLoc();
+  if (chipset != kGfx1250) {
+    return rewriter.notifyMatchFailure(
+        loc,
+        "Scaled fp packed conversion instructions are not available on target "
+        "architecture and their emulation is not implemented");
+  }
+  int32_t firstScaleLane = op.getFirstScaleLane();
+  int32_t firstScaleByte = op.getFirstScaleByte();
+  int32_t blockSize = op.getBlockSize();
+  auto sourceType = cast<VectorType>(op.getSource().getType());
+  auto srcElemType = cast<FloatType>(sourceType.getElementType());
+  unsigned bitWidth = srcElemType.getWidth();
+  int32_t scaleSel =
+      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
+
+  auto targetType = cast<VectorType>(op.getResult().getType());
+  auto destElemType = cast<FloatType>(targetType.getElementType());
+  IntegerType i32 = rewriter.getI32Type();
+  Value castedScale =
+      LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
+
+  Value source = adaptor.getSource();
+  Type llvmResultType = typeConverter->convertType(op.getResult().getType());
+  Type packedType = nullptr;
+  if (isa<fp4>(srcElemType)) {
+    packedType = i32;
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp8, bf8>(srcElemType)) {
+    packedType = VectorType::get(2, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp6, bf6>(srcElemType)) {
+    packedType = VectorType::get(3, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else {
+    llvm_unreachable("invalid element type for packed scaled ext");
+  }
+
+  if (!packedType || !llvmResultType) {
+    return rewriter.notifyMatchFailure(op, "type conversion failed");
+  }
+
+  Value castedSource =
+      LLVM::BitcastOp::create(rewriter, loc, packedType, source);
+
+  std::optional<StringRef> maybeIntrinsic =
+      scaledExtPacked816ToIntrinsic(srcElemType, destElemType);
+  if (!maybeIntrinsic.has_value())
+    return op.emitOpError(
+        "no intrinsic matching packed scaled conversion on the given chipset");
+
+  OperationState loweredOp(loc, *maybeIntrinsic);
+  loweredOp.addTypes({llvmResultType});
+  loweredOp.addOperands({castedSource, castedScale});
+
+  SmallVector<NamedAttribute> attrs;
+  attrs.push_back(
+      NamedAttribute("scaleSel", rewriter.getI32IntegerAttr(scaleSel)));
+
+  loweredOp.addAttributes(attrs);
+  Operation *lowered = rewriter.create(loweredOp);
+  rewriter.replaceOp(op, lowered);
+
+  return success();
+}
+
 LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
     ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -2151,9 +2329,10 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            ROCDL::RawPtrBufferAtomicCmpSwap>,
        AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
        SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
-       WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
-       PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
-       PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
-       TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
+       WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering,
+       ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
+       PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
+       GatherToLDSOpLowering, TransposeLoadOpLowering,
+       AMDGPUPermlaneLowering>(converter, chipset);
   patterns.add(converter);
 }
diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
index f2bacc3399144..cc3e8468f298b 100644
--- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
+++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp
@@ -281,6 +281,7 @@ ConstantOpLowering::matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const {
   return LLVM::detail::oneToOneRewrite(op, LLVM::ConstantOp::getOperationName(),
                                        adaptor.getOperands(), op->getAttrs(),
+                                       /*propAttr=*/Attribute{},
                                        *getTypeConverter(), rewriter);
 }
 
diff --git a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp
index 86d02e6c6209f..6a0c21185983e 100644
--- a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp
+++ b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp
@@ -96,7 +96,8 @@ struct ConstantOpLowering : public ConvertOpToLLVMPattern<complex::ConstantOp> {
                   ConversionPatternRewriter &rewriter) const override {
     return LLVM::detail::oneToOneRewrite(
         op, LLVM::ConstantOp::getOperationName(), adaptor.getOperands(),
-        op->getAttrs(), *getTypeConverter(), rewriter);
+        op->getAttrs(), /*propAttr=*/Attribute{}, *getTypeConverter(),
+        rewriter);
   }
 };
 
diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
index 48a03198fd465..f28a6ccb42455 100644
--- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp
@@ -296,19 +296,13 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors(
 // Detail methods
 //===----------------------------------------------------------------------===//
 
-void LLVM::detail::setNativeProperties(Operation *op,
-                                       IntegerOverflowFlags overflowFlags) {
-  if (auto iface = dyn_cast<IntegerOverflowFlagsInterface>(op))
-    iface.setOverflowFlags(overflowFlags);
-}
-
 /// Replaces the given operation "op" with a new operation of type "targetOp"
 /// and given operands.
 LogicalResult LLVM::detail::oneToOneRewrite(
     Operation *op, StringRef targetOp, ValueRange operands,
-    ArrayRef<NamedAttribute> targetAttrs,
-    const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter,
-    IntegerOverflowFlags overflowFlags) {
+    ArrayRef<NamedAttribute> targetAttrs, Attribute propertiesAttr,
+    const LLVMTypeConverter &typeConverter,
+    ConversionPatternRewriter &rewriter) {
   unsigned numResults = op->getNumResults();
 
   SmallVector<Type> resultTypes;
@@ -320,11 +314,10 @@ LogicalResult LLVM::detail::oneToOneRewrite(
   }
 
   // Create the operation through state since we don't know its C++ type.
-  Operation *newOp =
-      rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), operands,
-                      resultTypes, targetAttrs);
-
-  setNativeProperties(newOp, overflowFlags);
+  OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp), operands,
+                       resultTypes, targetAttrs);
+  state.propertiesAttr = propertiesAttr;
+  Operation *newOp = rewriter.create(state);
 
   // If the operation produced 0 or 1 result, return them immediately.
   if (numResults == 0)
diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp
index e7dd0b506e12d..24b01259f0499 100644
--- a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp
@@ -105,9 +105,9 @@ LogicalResult LLVM::detail::handleMultidimensionalVectors(
 
 LogicalResult LLVM::detail::vectorOneToOneRewrite(
     Operation *op, StringRef targetOp, ValueRange operands,
-    ArrayRef<NamedAttribute> targetAttrs,
-    const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter,
-    IntegerOverflowFlags overflowFlags) {
+    ArrayRef<NamedAttribute> targetAttrs, Attribute propertiesAttr,
+    const LLVMTypeConverter &typeConverter,
+    ConversionPatternRewriter &rewriter) {
   assert(!operands.empty());
 
   // Cannot convert ops if their operands are not of LLVM type.
@@ -116,15 +116,14 @@ LogicalResult LLVM::detail::vectorOneToOneRewrite(
 
   auto llvmNDVectorTy = operands[0].getType();
   if (!isa<LLVM::LLVMArrayType>(llvmNDVectorTy))
-    return oneToOneRewrite(op, targetOp, operands, targetAttrs, typeConverter,
-                           rewriter, overflowFlags);
-
-  auto callback = [op, targetOp, targetAttrs, overflowFlags,
+    return oneToOneRewrite(op, targetOp, operands, targetAttrs, propertiesAttr,
+                           typeConverter, rewriter);
+  auto callback = [op, targetOp, targetAttrs, propertiesAttr,
                    &rewriter](Type llvm1DVectorTy, ValueRange operands) {
-    Operation *newOp =
-        rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp),
-                        operands, llvm1DVectorTy, targetAttrs);
-    LLVM::detail::setNativeProperties(newOp, overflowFlags);
+    OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp),
+                         operands, llvm1DVectorTy, targetAttrs);
+    state.propertiesAttr = propertiesAttr;
+    Operation *newOp = rewriter.create(state);
     return newOp->getResult(0);
   };
 
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 5c35823678576..d55f3cec47c1f 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -343,28 +343,41 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
 //===----------------------------------------------------------------------===//
 
 LogicalResult ScaledExtPacked816Op::verify() {
   int blockSize = getBlockSize();
-  assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+  assert(llvm::is_contained({16, 32}, blockSize) && "invalid block size");
   int firstScaleByte = getFirstScaleByte();
+  int firstScaleLane = getFirstScaleLane();
   auto sourceType = cast<VectorType>(getSource().getType());
   Type elementType = sourceType.getElementType();
   auto floatType = cast<FloatType>(elementType);
-  int bitWidth = floatType.getWidth();
+  unsigned bitWidth = floatType.getWidth();
 
-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
-      !llvm::is_contained({0, 1}, firstScaleByte)) {
-    return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
-                       "for f4 and f6.");
-  }
-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
-      !llvm::is_contained({0, 2}, firstScaleByte)) {
-    return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
-                       "for f4 and f6.");
-  }
-  if (bitWidth == 8 && blockSize == 16 &&
-      !llvm::is_contained({0, 2}, firstScaleByte)) {
-    return emitOpError(
-        "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.");
+  assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth));
+
+  const bool is_fp8 = bitWidth == 8;
+  const bool is_block_16 = blockSize == 16;
+
+  if (!is_fp8) {
+    if (is_block_16) {
+      if (!llvm::is_contained({0, 1}, firstScaleByte)) {
+        return emitOpError("blockSize of 16 can only have firstScaleByte be 0 "
+                           "or 1 for f4 and f6.");
+      }
+    } else {
+      if (!llvm::is_contained({0, 2}, firstScaleByte)) {
+        return emitOpError("blockSize of 32 can only have firstScaleByte be 0 "
+                           "or 2 for f4 and f6.");
+      }
+    }
+  } else {
+    if (is_block_16) {
+      bool is_valid = ((firstScaleLane == 0) && (firstScaleByte == 0)) ||
+                      ((firstScaleLane == 1) && (firstScaleByte == 2));
+      if (!is_valid) {
+        return emitOpError("blockSize of 16 can only have (firstScaleLane, "
+                           "firstScaleByte) be (0, 0) or (1, 2) for f8.");
+      }
+    }
   }
 
   return success();
diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
index de3efc9fe3506..e256915933a71 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
+++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td
@@ -389,8 +389,8 @@ def TruncIExtUIToExtUI :
 // trunci(shrsi(x, c)) -> trunci(shrui(x, c))
 def TruncIShrSIToTrunciShrUI :
     Pat<(Arith_TruncIOp:$tr
-           (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0)), $overflow),
-        (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0))), $overflow),
+           (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0), $exact), $overflow),
+        (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0)), $exact), $overflow),
        [(TruncationMatchesShiftAmount $x, $tr, $c0)]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 7bfc3f6664d74..3ea9b772398ee 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -4223,6 +4223,34 @@ LogicalResult InlineAsmOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// UDivOp
+//===----------------------------------------------------------------------===//
+
+Speculation::Speculatability UDivOp::getSpeculatability() {
+  // X / 0 => UB
+  Value divisor = getRhs();
+  if (matchPattern(divisor, m_IntRangeWithoutZeroU()))
+    return Speculation::Speculatable;
+
+  return Speculation::NotSpeculatable;
+}
+
+//===----------------------------------------------------------------------===//
+// SDivOp
+//===----------------------------------------------------------------------===//
+
+Speculation::Speculatability SDivOp::getSpeculatability() {
+  // This function conservatively assumes that all signed divisions by -1 are
+  // not speculatable.
+  // X / 0 => UB
+  // INT_MIN / -1 => UB
+  Value divisor = getRhs();
+  if (matchPattern(divisor, m_IntRangeWithoutZeroS()) &&
+      matchPattern(divisor, m_IntRangeWithoutNegOneS()))
+    return Speculation::Speculatable;
+
+  return Speculation::NotSpeculatable;
+}
+
 //===----------------------------------------------------------------------===//
 // LLVMDialect initialization, type parsing, and registration.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp
new file mode 100644
index 0000000000000..12efaf487a8ca
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp
@@ -0,0 +1,237 @@
+//===- ACCImplicitRoutine.cpp - OpenACC Implicit Routine Transform -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the implicit rules described in the OpenACC
+// specification for the `Routine Directive` (OpenACC 3.4 spec, section 2.15.1).
+//
+// "If no explicit routine directive applies to a procedure whose definition
+// appears in the program unit being compiled, then the implementation applies
+// an implicit routine directive to that procedure if any of the following
+// conditions holds:
+// - The procedure is called or its address is accessed in a compute region."
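+// (Here a compute region is the body of one of the OpenACC compute
+// constructs: acc.parallel, acc.kernels, or acc.serial.)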
+//
+// The specification further states:
+// "When the implementation applies an implicit routine directive to a
+// procedure, it must recursively apply implicit routine directives to other
+// procedures for which the above rules specify relevant dependencies. Such
+// dependencies can form a cycle, so the implementation must take care to avoid
+// infinite recursion."
+//
+// This pass implements these requirements by:
+// 1. Walking through all OpenACC compute constructs and functions already
+//    marked with `acc routine` in the module and identifying function calls
+//    within these regions.
+// 2. Creating implicit `acc.routine` operations for functions that don't
+//    already have routine declarations.
+// 3. Recursively walking through all existing `acc routine` operations and
+//    creating implicit routine operations for function calls within these
+//    routines, while avoiding infinite recursion through proper tracking.
+//
+// Requirements:
+// -------------
+// To use this pass in a pipeline, the following requirements must be met:
+//
+// 1. Operation Interface Implementation: Operations that define functions
+//    or call functions should implement `mlir::FunctionOpInterface` and
+//    `mlir::CallOpInterface` respectively.
+//
+// 2. Analysis Registration (Optional): If custom behavior is needed for
+//    determining if a symbol use is valid within GPU regions, the dialect
+//    should pre-register the `acc::OpenACCSupport` analysis.
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include <queue>
+
+#define DEBUG_TYPE "acc-implicit-routine"
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCIMPLICITROUTINE
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+namespace {
+
+using namespace mlir;
+
+class ACCImplicitRoutine
+    : public acc::impl::ACCImplicitRoutineBase<ACCImplicitRoutine> {
+private:
+  unsigned routineCounter = 0;
+  static constexpr llvm::StringRef accRoutinePrefix = "acc_routine_";
+
+  // Count existing routine operations and update the counter.
+  void initRoutineCounter(ModuleOp module) {
+    module.walk([&](acc::RoutineOp routineOp) { routineCounter++; });
+  }
+
+  // Returns true if the `acc routine` has a default bind clause or a
+  // device-type specific bind clause for the given device type.
+  bool isACCRoutineBindDefaultOrDeviceType(acc::RoutineOp op,
+                                           acc::DeviceType deviceType) {
+    // Fast check to avoid device-type specific lookups.
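+    // getBindIdName()/getBindStrName() cover the default bind clause; the
+    // device-type specific variants are reached through the
+    // getBindNameValue() calls below.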
+    if (!op.getBindIdName() && !op.getBindStrName())
+      return false;
+    return op.getBindNameValue().has_value() ||
+           op.getBindNameValue(deviceType).has_value();
+  }
+
+  // Generate a unique name for the routine and create the routine operation.
+  acc::RoutineOp createRoutineOp(OpBuilder &builder, Location loc,
+                                 FunctionOpInterface &callee) {
+    std::string routineName =
+        (accRoutinePrefix + std::to_string(routineCounter++)).str();
+    auto routineOp = acc::RoutineOp::create(
+        builder, loc,
+        /* sym_name=*/builder.getStringAttr(routineName),
+        /* func_name=*/
+        mlir::SymbolRefAttr::get(builder.getContext(),
+                                 builder.getStringAttr(callee.getName())),
+        /* bindIdName=*/nullptr,
+        /* bindStrName=*/nullptr,
+        /* bindIdNameDeviceType=*/nullptr,
+        /* bindStrNameDeviceType=*/nullptr,
+        /* worker=*/nullptr,
+        /* vector=*/nullptr,
+        /* seq=*/nullptr,
+        /* nohost=*/nullptr,
+        /* implicit=*/builder.getUnitAttr(),
+        /* gang=*/nullptr,
+        /* gangDim=*/nullptr,
+        /* gangDimDeviceType=*/nullptr);
+
+    // Assert that the callee does not already have a routine info attribute.
+    assert(!callee->hasAttr(acc::getRoutineInfoAttrName()) &&
+           "function is already associated with a routine");
+
+    callee->setAttr(
+        acc::getRoutineInfoAttrName(),
+        mlir::acc::RoutineInfoAttr::get(
+            builder.getContext(),
+            {mlir::SymbolRefAttr::get(builder.getContext(),
+                                      builder.getStringAttr(routineName))}));
+    return routineOp;
+  }
+
+  // Used to walk through a compute region looking for function calls.
+  void
+  implicitRoutineForCallsInComputeRegions(Operation *op, SymbolTable &symTab,
+                                          mlir::OpBuilder &builder,
+                                          acc::OpenACCSupport &accSupport) {
+    op->walk([&](CallOpInterface callOp) {
+      if (!callOp.getCallableForCallee())
+        return;
+
+      auto calleeSymbolRef =
+          dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+      // When the call is done through an SSA value, the callee is not a
+      // symbol. Skip it because we don't know the call target.
+      if (!calleeSymbolRef)
+        return;
+
+      auto callee = symTab.lookup<FunctionOpInterface>(
+          calleeSymbolRef.getLeafReference().str());
+      assert(callee && "callee function must be found in symbol table");
+      // If the callee is already a valid symbol for GPU regions, skip it.
+      if (accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef))
+        return;
+      builder.setInsertionPoint(callee);
+      createRoutineOp(builder, callee.getLoc(), callee);
+    });
+  }
+
+  // Recursively handle calls within a routine operation.
+  void implicitRoutineForCallsInRoutine(acc::RoutineOp routineOp,
+                                        mlir::OpBuilder &builder,
+                                        acc::OpenACCSupport &accSupport,
+                                        acc::DeviceType targetDeviceType) {
+    // When a bind clause is used, it means that the target is different from
+    // the function to which the `acc routine` is applied. Skip this case to
+    // avoid implicitly and recursively marking calls that would not end up
+    // on the device.
+    if (isACCRoutineBindDefaultOrDeviceType(routineOp, targetDeviceType))
+      return;
+
+    SymbolTable symTab(routineOp->getParentOfType<ModuleOp>());
+    std::queue<acc::RoutineOp> routineQueue;
+    routineQueue.push(routineOp);
+    while (!routineQueue.empty()) {
+      auto currentRoutine = routineQueue.front();
+      routineQueue.pop();
+      auto func = symTab.lookup<FunctionOpInterface>(
+          currentRoutine.getFuncName().getLeafReference());
+      func.walk([&](CallOpInterface callOp) {
+        if (!callOp.getCallableForCallee())
+          return;
+
+        auto calleeSymbolRef =
+            dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+        // When the call is done through an SSA value, the callee is not a
+        // symbol. Skip it because we don't know the call target.
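+        // (e.g. an indirect call through a function-pointer SSA value)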
+        if (!calleeSymbolRef)
+          return;
+
+        auto callee = symTab.lookup<FunctionOpInterface>(
+            calleeSymbolRef.getLeafReference().str());
+        assert(callee && "callee function must be found in symbol table");
+        // If the callee is already a valid symbol for GPU regions, skip it.
+        if (accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef))
+          return;
+        builder.setInsertionPoint(callee);
+        auto newRoutineOp = createRoutineOp(builder, callee.getLoc(), callee);
+        routineQueue.push(newRoutineOp);
+      });
+    }
+  }
+
+public:
+  using ACCImplicitRoutineBase::ACCImplicitRoutineBase;
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    mlir::OpBuilder builder(module.getContext());
+    SymbolTable symTab(module);
+    initRoutineCounter(module);
+
+    acc::OpenACCSupport &accSupport = getAnalysis<acc::OpenACCSupport>();
+
+    // Handle compute regions.
+    module.walk([&](Operation *op) {
+      if (isa<ACC_COMPUTE_CONSTRUCT_OPS>(op))
+        implicitRoutineForCallsInComputeRegions(op, symTab, builder,
+                                                accSupport);
+    });
+
+    // Use the device type option from the pass options.
+    acc::DeviceType targetDeviceType = deviceType;
+
+    // Handle existing routines.
+    module.walk([&](acc::RoutineOp routineOp) {
+      implicitRoutineForCallsInRoutine(routineOp, builder, accSupport,
+                                       targetDeviceType);
+    });
+  }
+};
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index f8fff5958f8c7..028af0362f26e 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIROpenACCTransforms
   ACCImplicitData.cpp
+  ACCImplicitRoutine.cpp
   LegalizeDataValues.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
index ffa8b402e0b6b..99048034b4f0c 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp
@@ -80,6 +80,53 @@ inline static bool includesDenseOutput(SortMask mask) {
   return includesAny(mask, SortMask::kIncludeDenseOutput);
 }
 
+/// Returns a sparsity rank for loop ordering: lower values indicate
+/// dimensions that should be placed in outer loops.
+/// 0 = Dense, 1 = Compressed, 2 = Singleton, 3 = Other/Unknown.
+static unsigned getLoopSparsityRank(unsigned loop, ArrayRef<Value> allTensors,
+                                    ArrayRef<AffineMap> allMaps) {
+  // Start with the highest rank.
+  unsigned minRank = 3;
+
+  for (auto [tensor, map] : llvm::zip(allTensors, allMaps)) {
+    // Check if this loop accesses this tensor.
+    bool loopAccessesTensor = false;
+    unsigned tensorDim = 0;
+    for (AffineExpr expr : map.getResults()) {
+      if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) {
+        if (dimExpr.getPosition() == loop) {
+          loopAccessesTensor = true;
+          break;
+        }
+      }
+      tensorDim++;
+    }
+
+    if (loopAccessesTensor) {
+      const auto enc = getSparseTensorEncoding(tensor.getType());
+      if (!enc) {
+        // Dense tensor - lowest rank.
+        return 0;
+      } else {
+        // Sparse tensor - check the level type for this dimension.
+        auto lvlTypes = enc.getLvlTypes();
+        if (tensorDim < lvlTypes.size()) {
+          auto lvlType = lvlTypes[tensorDim];
+          if (isDenseLT(lvlType)) {
+            return 0; // Dense level.
+          } else if (isCompressedLT(lvlType)) {
+            minRank = std::min(minRank, 1u); // Compressed level.
+          } else if (isSingletonLT(lvlType)) {
+            minRank = std::min(minRank, 2u); // Singleton level.
+          }
+        }
+      }
+    }
+  }
+
+  return minRank;
+}
+
 AffineMap IterationGraphSorter::topoSort() {
   // The sorted result will put the first Reduction iterator to the
   // latest possible position.
@@ -107,10 +154,33 @@ AffineMap IterationGraphSorter::topoSort() {
     case sparse_tensor::LoopOrderingStrategy::kDefault:
       src = it.back();
       break;
+    case sparse_tensor::LoopOrderingStrategy::kDenseOuter: {
+      // Prefer dense, then compressed, then singleton dimensions outermost.
+      // Create combined tensor and map lists for analysis.
+      SmallVector<Value> allTensors = ins;
+      allTensors.push_back(out);
+      SmallVector<AffineMap> allMaps = loop2InsLvl;
+      allMaps.push_back(loop2OutLvl);
+
+      // Find loop with minimum (lowest) sparsity rank.
+      unsigned minLoop = it[0];
+      unsigned minRank = getLoopSparsityRank(minLoop, allTensors, allMaps);
+
+      for (auto candidateLoop : it) {
+        unsigned rank = getLoopSparsityRank(candidateLoop, allTensors, allMaps);
+        if (rank < minRank || (rank == minRank && candidateLoop < minLoop)) {
+          minLoop = candidateLoop;
+          minRank = rank;
+        }
+      }
+      src = minLoop;
+      break;
+    }
     }
     loopOrder.push_back(src);
-    it.pop_back();
+    // Remove the selected loop from the worklist.
+    it.erase(std::find(it.begin(), it.end(), src));
 
     // Update in-degree, and push 0-degree node into worklist.
     for (unsigned dst = 0; dst < numLoops; dst++) {
       if (itGraph[src][dst] && --inDegree[dst] == 0) {
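A note on the `kDenseOuter` strategy added above: `getLoopSparsityRank` returns 0 as soon as a loop indexes any un-encoded (all-dense) operand, so only loops that exclusively touch sparse levels get pushed inward, and ties keep the smaller loop index. A tiny standalone walk-through of the selection, with an assumed rank table (illustrative sketch only, not part of the patch):

#include <cstdio>

int main() {
  // Ranks as getLoopSparsityRank would compute them for x += A(i, j), with A
  // stored as CSR (level 0 dense, level 1 compressed) and a scalar output
  // indexed by neither loop: loop 0 (i) -> 0, loop 1 (j) -> 1.
  const unsigned rank[2] = {0, 1};
  // Mirror of the kDenseOuter selection: the minimum rank wins; with a
  // strict comparison, ties keep the smaller loop index.
  unsigned minLoop = 0;
  for (unsigned loop = 1; loop < 2; ++loop)
    if (rank[loop] < rank[minLoop])
      minLoop = loop;
  std::printf("outermost loop: %u\n", minLoop); // prints 0: i runs outermost
  return 0;
}

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 4dd10bedc6d84..85c9a966f0fe8 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -901,7 +901,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state,
                          IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint,
                          xegpu::CachePolicyAttr l2_hint,
                          xegpu::CachePolicyAttr l3_hint,
-                         xegpu::LayoutAttr layout) {
+                         DistributeLayoutAttr layout) {
   auto loc = source.getLoc();
   int64_t size = static_cast<int64_t>(offsets.size());
   auto type = VectorType::get(size, builder.getIndexType());
@@ -985,7 +985,7 @@ void StoreScatterOp::build(
     OpBuilder &builder, OperationState &state, Value value, Value dest,
     ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size,
     xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint,
-    xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) {
+    xegpu::CachePolicyAttr l3_hint, DistributeLayoutAttr layout) {
   auto loc = dest.getLoc();
   int64_t size = static_cast<int64_t>(offsets.size());
   auto type = VectorType::get(size, builder.getIndexType());
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index c3bf9606693a8..330553564f81a 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset
           pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter);
     }
 
-    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr());
+    auto layout = op.getLayoutAttr();
     if (layout)
       layout = layout.dropInstData();
 
@@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets
     SmallVector<Value> convertedValues =
         pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
 
-    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr());
+    auto layout = op.getLayoutAttr();
     if (layout)
       layout = layout.dropInstData();

The switch from the concrete `xegpu::LayoutAttr` to the `DistributeLayoutAttr` interface above (and in the workgroup-to-subgroup patterns below) is what lets `#xegpu.slice` layouts survive these rewrites. A minimal sketch, assuming (as this patch arranges) that `getLayoutAttr()` returns the interface; the helper name is hypothetical:

// Illustrative only; `op` stands for a matched xegpu::LoadGatherOp.
static xegpu::DistributeLayoutAttr getUnrolledLayout(xegpu::LoadGatherOp op) {
  // Before: a #xegpu.slice layout failed the dyn_cast to the concrete
  // xegpu::LayoutAttr and was silently dropped:
  //   auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr());
  // After: both #xegpu.layout and #xegpu.slice flow through, with their
  // instruction-level tiling stripped for the unrolled ops:
  xegpu::DistributeLayoutAttr layout = op.getLayoutAttr();
  if (layout)
    layout = layout.dropInstData();
  return layout;
}

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 0a9ef0aa6df96..33d4b0457e5d3 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp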
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -889,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset
       return failure();
 
     ArrayRef<int64_t> wgShape = resultType.getShape();
-    xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>(
-        xegpu::getDistributeLayoutAttr(op.getResult()));
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getDistributeLayoutAttr(op.getResult());
     if (!layout || !layout.isForWorkgroup())
       return failure();
 
@@ -913,10 +913,12 @@ struct WgToSgLoadGatherOpWithOffset
     VectorType newTy = VectorType::get(sgShape, resultType.getElementType());
     for (auto [offsets, mask] :
          llvm::zip(adaptor.getOffsets(), adaptor.getMask())) {
+      auto newLayout = layout.dropSgLayoutAndData();
       auto newLoadOp = xegpu::LoadGatherOp::create(
           rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr,
           op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(),
-          layout.dropSgLayoutAndData());
+          newLayout);
+      xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout);
       newLoadOps.push_back(newLoadOp);
     }
     rewriter.replaceOpWithMultiple(op, {newLoadOps});
@@ -941,8 +943,8 @@ struct WgToSgStoreScatterOpWithOffset
     if (!valueType)
       return failure();
 
-    xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>(
-        xegpu::getDistributeLayoutAttr(op.getOperand(0)));
+    xegpu::DistributeLayoutAttr layout =
+        xegpu::getDistributeLayoutAttr(op.getOperand(0));
     if (!layout || !layout.isForWorkgroup())
       return failure();
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 2fd3df6dcfa71..432b8876696a9 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -456,3 +456,4 @@ func.func @sched_barrier() {
   amdgpu.sched_barrier allow =
   func.return
 }
+
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
new file mode 100644
index 0000000000000..d2391140ce056
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
@@ -0,0 +1,164 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics \
+// RUN: | FileCheck %s
+
+// CHECK-LABEL: @scaled_ext_packed816_fp4
+// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf4E2M1FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>)
+func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) {
+  // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8>
+  // CHECK: %[[SOURCE_8xi4:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf4E2M1FN> to vector<8xi4>
+  // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
+  // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32
+  // CHECK: rocdl.cvt.scale.pk8.f16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf16>
+  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
+
+  // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32
+  // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32
+  // CHECK: rocdl.cvt.scale.pk8.bf16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xbf16>
+  %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>,
vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.f32.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2: vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_fp8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E4M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E4M3FN> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E5M2>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E5M2> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: %[[RES:.+]] = rocdl.cvt.scale.pk8.f16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = 
llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + + +// CHECK-LABEL: @scaled_ext_packed816_fp6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E2M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E2M3FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.bf16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E3M2FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E3M2FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.bf6 
%[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.bf16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have (firstScaleLane, firstScaleByte) be (0, 0) or (1, 2) for f8.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_src_elem_type(%v: vector<16xf16>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op operand #0 must be}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf16>, vector<4xf8E8M0FNU> -> vector<16xf16> + return %ret0: vector<16xf16> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M2FN>, %scale: 
vector<4xf8E8M0FNU>) -> (vector<16xf64>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op result #0 must be vector}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> + return %ret0: vector<16xf64> +} diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index 5f1ec66234df2..6fdc1104d2609 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -738,6 +738,22 @@ func.func @ops_supporting_overflow(%arg0: i64, %arg1: i64) { // ----- +// CHECK-LABEL: @ops_supporting_exact +func.func @ops_supporting_exact(i32, i32) { +^bb0(%arg0: i32, %arg1: i32): +// CHECK: = llvm.ashr exact %arg0, %arg1 : i32 + %0 = arith.shrsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.lshr exact %arg0, %arg1 : i32 + %1 = arith.shrui %arg0, %arg1 exact : i32 +// CHECK: = llvm.sdiv exact %arg0, %arg1 : i32 + %2 = arith.divsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.udiv exact %arg0, %arg1 : i32 + %3 = arith.divui %arg0, %arg1 exact : i32 + return +} + +// ----- + // CHECK-LABEL: func @memref_bitcast // CHECK-SAME: (%[[ARG:.*]]: memref) // CHECK: %[[V1:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 5c8cc8b67c4b3..61fdf29a78cbd 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -333,38 +333,6 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : // ----- -func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> - func.return -} - -// ----- - -func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> - func.return -} - -// ----- - -func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> - func.return -} - -// ----- - -func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { - // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} - %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> - func.return -} - -// ----- - func.func 
@scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> { // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}} %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32> diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 2fe0995c9d4df..3ad1530248809 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2958,6 +2958,19 @@ func.func @truncIShrSIToTrunciShrUI(%a: i64) -> i32 { return %hi : i32 } +// CHECK-LABEL: @truncIShrSIExactToTrunciShrUIExact +// CHECK-SAME: (%[[A:.+]]: i64) +// CHECK-NEXT: %[[C32:.+]] = arith.constant 32 : i64 +// CHECK-NEXT: %[[SHR:.+]] = arith.shrui %[[A]], %[[C32]] exact : i64 +// CHECK-NEXT: %[[TRU:.+]] = arith.trunci %[[SHR]] : i64 to i32 +// CHECK-NEXT: return %[[TRU]] : i32 +func.func @truncIShrSIExactToTrunciShrUIExact(%a: i64) -> i32 { + %c32 = arith.constant 32: i64 + %sh = arith.shrsi %a, %c32 exact : i64 + %hi = arith.trunci %sh: i64 to i32 + return %hi : i32 +} + // CHECK-LABEL: @truncIShrSIToTrunciShrUIBadShiftAmt1 // CHECK: arith.shrsi func.func @truncIShrSIToTrunciShrUIBadShiftAmt1(%a: i64) -> i32 { diff --git a/mlir/test/Dialect/Arith/ops.mlir b/mlir/test/Dialect/Arith/ops.mlir index 1e656e84da836..58eadfda17060 100644 --- a/mlir/test/Dialect/Arith/ops.mlir +++ b/mlir/test/Dialect/Arith/ops.mlir @@ -151,6 +151,12 @@ func.func @test_divui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divui_exact +func.func @test_divui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divui_tensor func.func @test_divui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divui %arg0, %arg1 : tensor<8x8xi64> @@ -175,6 +181,12 @@ func.func @test_divsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divsi_exact +func.func @test_divsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divsi_tensor func.func @test_divsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divsi %arg0, %arg1 : tensor<8x8xi64> @@ -391,6 +403,12 @@ func.func @test_shrui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrui_exact +func.func @test_shrui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrui_tensor func.func @test_shrui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrui %arg0, %arg1 : tensor<8x8xi64> @@ -415,6 +433,12 @@ func.func @test_shrsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrsi_exact +func.func @test_shrsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrsi_tensor func.func @test_shrsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrsi %arg0, %arg1 : tensor<8x8xi64> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 
4fbb566cfbe73..5dde84e8e0bc2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -547,4 +547,21 @@ gpu.module @test_distribution { %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> gpu.return } + + // CHECK-LABEL: distribute_load_slice_attr + gpu.func @distribute_load_slice_attr() { + %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> + + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : + // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + + // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> + %4 = vector.broadcast %3 {layout_result_0 = + #xegpu.layout} : vector<256xf32> to vector<256x256xf32> + gpu.return + } } diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index c1604e226a334..31a4f64dd7de0 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -880,6 +880,18 @@ func.func @no_speculate_divui( return } +func.func @no_speculate_udiv( +// CHECK-LABEL: @no_speculate_udiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_divsi( // CHECK-LABEL: @no_speculate_divsi( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -892,6 +904,18 @@ func.func @no_speculate_divsi( return } +func.func @no_speculate_sdiv( +// CHECK-LABEL: @no_speculate_sdiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_ceildivui( // CHECK-LABEL: @no_speculate_ceildivui( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -928,6 +952,18 @@ func.func @no_speculate_divui_const(%num: i32, %lb: index, %ub: index, %step: in return } +func.func @no_speculate_udiv_const(%num: i32, %lb: index, %ub: index, %step: index) { +// CHECK-LABEL: @no_speculate_udiv_const( + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %c0 : i32 + } + + return +} + func.func @speculate_divui_const( // CHECK-LABEL: @speculate_divui_const( %num: i32, %lb: index, %ub: index, %step: index) { @@ -941,6 +977,19 @@ func.func @speculate_divui_const( return } +func.func @speculate_udiv_const( +// CHECK-LABEL: @speculate_udiv_const( + %num: i32, %lb: index, %ub: index, %step: index) { + %c5 = llvm.mlir.constant(5 : i32) : i32 +// CHECK: llvm.udiv +// CHECK: scf.for + scf.for %i = %lb to %ub step %step { + %val = llvm.udiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivui_const(%num: i32, %lb: 
index, %ub: index, %step: index) { // CHECK-LABEL: @no_speculate_ceildivui_const( %c0 = arith.constant 0 : i32 @@ -979,6 +1028,19 @@ func.func @no_speculate_divsi_const0( return } +func.func @no_speculate_sdiv_const0( +// CHECK-LABEL: @no_speculate_sdiv_const0( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %c0 : i32 + } + + return +} + func.func @no_speculate_divsi_const_minus1( // CHECK-LABEL: @no_speculate_divsi_const_minus1( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -992,6 +1054,19 @@ func.func @no_speculate_divsi_const_minus1( return } +func.func @no_speculate_sdiv_const_minus1( +// CHECK-LABEL: @no_speculate_sdiv_const_minus1( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %cm1 = arith.constant -1 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %cm1 : i32 + } + + return +} + func.func @speculate_divsi_const( // CHECK-LABEL: @speculate_divsi_const( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1005,6 +1080,19 @@ func.func @speculate_divsi_const( return } +func.func @speculate_sdiv_const( +// CHECK-LABEL: @speculate_sdiv_const( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c5 = arith.constant 5 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: llvm.sdiv +// CHECK: scf.for + %val = llvm.sdiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivsi_const0( // CHECK-LABEL: @no_speculate_ceildivsi_const0( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1057,6 +1145,19 @@ func.func @no_speculate_divui_range( return } +func.func @no_speculate_udiv_range( +// CHECK-LABEL: @no_speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i8 + } + + return +} + func.func @no_speculate_divsi_range( // CHECK-LABEL: @no_speculate_divsi_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1072,6 +1173,21 @@ func.func @no_speculate_divsi_range( return } +func.func @no_speculate_sdiv_range( +// CHECK-LABEL: @no_speculate_sdiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = -1: i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = 127 : i8, smin = 0 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK-COUNT-2: llvm.sdiv + %val0 = llvm.sdiv %num, %denom0 : i8 + %val1 = llvm.sdiv %num, %denom1 : i8 + } + + return +} + func.func @no_speculate_ceildivui_range( // CHECK-LABEL: @no_speculate_ceildivui_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1113,6 +1229,19 @@ func.func @speculate_divui_range( return } +func.func @speculate_udiv_range( +// CHECK-LABEL: @speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 1 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: llvm.udiv +// CHECK: scf.for + %val = llvm.udiv %num, %denom : i8 + } + + return +} + func.func @speculate_divsi_range( // CHECK-LABEL: @speculate_divsi_range( %num: i8, %lb: index, %ub: index, %step: 
index) { @@ -1129,6 +1258,22 @@ func.func @speculate_divsi_range( return } +func.func @speculate_sdiv_range( +// CHECK-LABEL: @speculate_sdiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = 127 : i8, smin = 1 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = -2 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK-COUNT-2: llvm.sdiv +// CHECK: scf.for + %val0 = llvm.sdiv %num, %denom0 : i8 + %val1 = llvm.sdiv %num, %denom1 : i8 + + } + + return +} + func.func @speculate_ceildivui_range( // CHECK-LABEL: @speculate_ceildivui_range( %num: i8, %lb: index, %ub: index, %step: index) { diff --git a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt index 7cc130d02ad74..568126fd342cc 100644 --- a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt @@ -4,4 +4,5 @@ add_mlir_unittest(MLIRLLVMIRTests mlir_target_link_libraries(MLIRLLVMIRTests PRIVATE MLIRLLVMDialect + MLIRInferIntRangeInterface ) diff --git a/openmp/tools/omptest/test/CMakeLists.txt b/openmp/tools/omptest/test/CMakeLists.txt index 1e07a1044f7d6..2b4aa78b0bc16 100644 --- a/openmp/tools/omptest/test/CMakeLists.txt +++ b/openmp/tools/omptest/test/CMakeLists.txt @@ -9,7 +9,7 @@ set(UNITTEST_SOURCES unittests/asserter-seq-test.cpp unittests/internal-event-eq-test.cpp unittests/internal-event-tostring-test.cpp - unittests/internal-util-test + unittests/internal-util-test.cpp unittests/main-test.cpp ) add_executable(omptest-unittests ${UNITTEST_SOURCES}) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 788c6570081a2..a27abbd5b386a 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1103,6 +1103,7 @@ libc_support_library( ":func_realloc", ":hdr_stdio_macros", ":hdr_stdio_overlay", + ":string_memory_utils", ":types_off_t", ], ) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 67c397e34b8c7..85c64ffd58ca6 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2193,92 +2193,253 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AArch64", "short_name": "AArch64", - "tbl_outs": { - "lib/Target/AArch64/AArch64GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AArch64/AArch64GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AArch64/AArch64GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AArch64/AArch64GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AArch64/AArch64GenAsmWriter1.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/AArch64/AArch64GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AArch64/AArch64GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AArch64/AArch64GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/AArch64/AArch64GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64O0PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - 
"-combiners=AArch64PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerLowering", - ], - "lib/Target/AArch64/AArch64GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AArch64/AArch64GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AArch64/AArch64GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AArch64/AArch64GenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AArch64/AArch64GenSystemOperands.inc": ["-gen-searchable-tables"], - "lib/Target/AArch64/AArch64GenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AArch64/AArch64GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AArch64/AArch64GenRegisterInfo.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoEnums.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoMCDesc.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoHeader.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AArch64/AArch64GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AArch64/AArch64GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/AArch64/AArch64GenAsmWriter1.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AArch64/AArch64GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AArch64/AArch64GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/AArch64/AArch64GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/AArch64/AArch64GenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64O0PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerLowering", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AArch64/AArch64GenCallingConv.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AArch64/AArch64GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AArch64/AArch64GenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/AArch64/AArch64GenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AArch64/AArch64GenSystemOperands.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/AArch64/AArch64GenExegesis.inc", + ), + ], }, { "name": "ARM", "short_name": "ARM", - "tbl_outs": { - "lib/Target/ARM/ARMGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/ARM/ARMGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/ARM/ARMGenSystemRegister.inc": ["-gen-searchable-tables"], - "lib/Target/ARM/ARMGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/ARM/ARMGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/ARM/ARMGenMCPseudoLowering.inc": 
["-gen-pseudo-lowering"], - "lib/Target/ARM/ARMGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/ARM/ARMGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/ARM/ARMGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/ARM/ARMGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/ARM/ARMGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/ARM/ARMGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/ARM/ARMGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/ARM/ARMGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/ARM/ARMGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/ARM/ARMGenRegisterInfo.inc", + "lib/Target/ARM/ARMGenRegisterInfoEnums.inc", + "lib/Target/ARM/ARMGenRegisterInfoMCDesc.inc", + "lib/Target/ARM/ARMGenRegisterInfoHeader.inc", + "lib/Target/ARM/ARMGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/ARM/ARMGenSystemRegister.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/ARM/ARMGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/ARM/ARMGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/ARM/ARMGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/ARM/ARMGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/ARM/ARMGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/ARM/ARMGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/ARM/ARMGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/ARM/ARMGenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/ARM/ARMGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/ARM/ARMGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/ARM/ARMGenDisassemblerTables.inc", + ), + ], }, { "name": "AMDGPU", "short_name": "AMDGPU", - "tbl_outs": { - "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - "-ignore-non-decodable-operands", - "-ignore-fully-defined-operands", - ], - "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc": ["-gen-searchable-tables"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc", + ), + ( + 
["-gen-pseudo-lowering"], + "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + "-ignore-non-decodable-operands", + "-ignore-fully-defined-operands", + ], + "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AMDGPU/AMDGPUGenSDNodeInfo.inc", + ), + ], "tbl_deps": [ ":InstCombineTableGen", ":amdgpu_isel_target_gen", @@ -2288,184 +2449,567 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AVR", "short_name": "AVR", - "tbl_outs": { - "lib/Target/AVR/AVRGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AVR/AVRGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AVR/AVRGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AVR/AVRGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AVR/AVRGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AVR/AVRGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AVR/AVRGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AVR/AVRGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AVR/AVRGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AVR/AVRGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/AVR/AVRGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AVR/AVRGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AVR/AVRGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AVR/AVRGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/AVR/AVRGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AVR/AVRGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AVR/AVRGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AVR/AVRGenRegisterInfo.inc", + "lib/Target/AVR/AVRGenRegisterInfoEnums.inc", + "lib/Target/AVR/AVRGenRegisterInfoMCDesc.inc", + "lib/Target/AVR/AVRGenRegisterInfoHeader.inc", + "lib/Target/AVR/AVRGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AVR/AVRGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AVR/AVRGenSubtargetInfo.inc", + ), + ], }, { "name": "BPF", "short_name": "BPF", - "tbl_outs": { - "lib/Target/BPF/BPFGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/BPF/BPFGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/BPF/BPFGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/BPF/BPFGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/BPF/BPFGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/BPF/BPFGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/BPF/BPFGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/BPF/BPFGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/BPF/BPFGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/BPF/BPFGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/BPF/BPFGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/BPF/BPFGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + 
"lib/Target/BPF/BPFGenRegisterBank.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/BPF/BPFGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/BPF/BPFGenAsmMatcher.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/BPF/BPFGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/BPF/BPFGenDAGISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/BPF/BPFGenGlobalISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/BPF/BPFGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/BPF/BPFGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/BPF/BPFGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/BPF/BPFGenRegisterInfo.inc", + "lib/Target/BPF/BPFGenRegisterInfoEnums.inc", + "lib/Target/BPF/BPFGenRegisterInfoMCDesc.inc", + "lib/Target/BPF/BPFGenRegisterInfoHeader.inc", + "lib/Target/BPF/BPFGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/BPF/BPFGenSubtargetInfo.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/BPF/BPFGenSDNodeInfo.inc", + ), + ], }, { "name": "Hexagon", "short_name": "Hexagon", - "tbl_outs": { - "lib/Target/Hexagon/HexagonGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Hexagon/HexagonGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Hexagon/HexagonGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Hexagon/HexagonGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/Hexagon/HexagonGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Hexagon/HexagonGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Hexagon/HexagonGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Hexagon/HexagonGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Hexagon/HexagonGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Hexagon/HexagonGenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Hexagon/HexagonGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Hexagon/HexagonGenRegisterInfo.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoEnums.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoMCDesc.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoHeader.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc", + ), + ], }, { "name": "Lanai", "short_name": "Lanai", - "tbl_outs": { - "lib/Target/Lanai/LanaiGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Lanai/LanaiGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Lanai/LanaiGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Lanai/LanaiGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Lanai/LanaiGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Lanai/LanaiGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Lanai/LanaiGenRegisterInfo.inc": 
["-gen-register-info"], - "lib/Target/Lanai/LanaiGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/Lanai/LanaiGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Lanai/LanaiGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Lanai/LanaiGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Lanai/LanaiGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Lanai/LanaiGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Lanai/LanaiGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Lanai/LanaiGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Lanai/LanaiGenRegisterInfo.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoEnums.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoMCDesc.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoHeader.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/Lanai/LanaiGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Lanai/LanaiGenSubtargetInfo.inc", + ), + ], }, { "name": "LoongArch", "short_name": "LoongArch", - "tbl_outs": { - "lib/Target/LoongArch/LoongArchGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/LoongArch/LoongArchGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/LoongArch/LoongArchGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/LoongArch/LoongArchGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/LoongArch/LoongArchGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/LoongArch/LoongArchGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/LoongArch/LoongArchGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/LoongArch/LoongArchGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoEnums.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoMCDesc.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoHeader.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc", + ), + ], }, { "name": "Mips", "short_name": "Mips", - "tbl_outs": { - "lib/Target/Mips/MipsGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Mips/MipsGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Mips/MipsGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Mips/MipsGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Mips/MipsGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - "lib/Target/Mips/MipsGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Mips/MipsGenExegesis.inc": 
["-gen-exegesis"], - "lib/Target/Mips/MipsGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/Mips/MipsGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=MipsPostLegalizerCombiner", - ], - "lib/Target/Mips/MipsGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Mips/MipsGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/Mips/MipsGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/Mips/MipsGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Mips/MipsGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Mips/MipsGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Mips/MipsGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Mips/MipsGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Mips/MipsGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/Mips/MipsGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Mips/MipsGenMCCodeEmitter.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/Mips/MipsGenExegesis.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/Mips/MipsGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/Mips/MipsGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=MipsPostLegalizerCombiner", + ], + "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Mips/MipsGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/Mips/MipsGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/Mips/MipsGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Mips/MipsGenRegisterInfo.inc", + "lib/Target/Mips/MipsGenRegisterInfoEnums.inc", + "lib/Target/Mips/MipsGenRegisterInfoMCDesc.inc", + "lib/Target/Mips/MipsGenRegisterInfoHeader.inc", + "lib/Target/Mips/MipsGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Mips/MipsGenSubtargetInfo.inc", + ), + ], }, { "name": "MSP430", "short_name": "MSP430", - "tbl_outs": { - "lib/Target/MSP430/MSP430GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/MSP430/MSP430GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/MSP430/MSP430GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/MSP430/MSP430GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/MSP430/MSP430GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/MSP430/MSP430GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/MSP430/MSP430GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/MSP430/MSP430GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/MSP430/MSP430GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/MSP430/MSP430GenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/MSP430/MSP430GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/MSP430/MSP430GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/MSP430/MSP430GenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/MSP430/MSP430GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/MSP430/MSP430GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/MSP430/MSP430GenRegisterInfo.inc", + 
"lib/Target/MSP430/MSP430GenRegisterInfoEnums.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoMCDesc.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoHeader.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/MSP430/MSP430GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/MSP430/MSP430GenSubtargetInfo.inc", + ), + ], }, { "name": "NVPTX", "short_name": "NVPTX", - "tbl_outs": { - "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/NVPTX/NVPTXGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/NVPTX/NVPTXGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/NVPTX/NVPTXGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-register-info"], + [ + "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoEnums.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoMCDesc.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoHeader.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/NVPTX/NVPTXGenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/NVPTX/NVPTXGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/NVPTX/NVPTXGenDAGISel.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc", + ), + ], }, { "name": "PowerPC", "short_name": "PPC", - "tbl_outs": { - "lib/Target/PowerPC/PPCGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/PowerPC/PPCGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/PowerPC/PPCGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/PowerPC/PPCGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/PowerPC/PPCGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/PowerPC/PPCGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/PowerPC/PPCGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/PowerPC/PPCGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/PowerPC/PPCGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/PowerPC/PPCGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/PowerPC/PPCGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/PowerPC/PPCGenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/PowerPC/PPCGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/PowerPC/PPCGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/PowerPC/PPCGenRegisterInfo.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoEnums.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoMCDesc.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoHeader.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/PowerPC/PPCGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/PowerPC/PPCGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/PowerPC/PPCGenFastISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/PowerPC/PPCGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/PowerPC/PPCGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/PowerPC/PPCGenDisassemblerTables.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/PowerPC/PPCGenRegisterBank.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/PowerPC/PPCGenGlobalISel.inc", + ), + ( + 
["-gen-exegesis"], + "lib/Target/PowerPC/PPCGenExegesis.inc", + ), + ], }, { "name": "RISCV", "short_name": "RISCV", - "tbl_outs": { - "lib/Target/RISCV/RISCVGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/RISCV/RISCVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc": ["-gen-compress-inst-emitter"], - "lib/Target/RISCV/RISCVGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/RISCV/RISCVGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - ], - "lib/Target/RISCV/RISCVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/RISCV/RISCVGenMacroFusion.inc": ["-gen-macro-fusion-pred"], - "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/RISCV/RISCVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/RISCV/RISCVGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/RISCV/RISCVGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/RISCV/RISCVGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/RISCV/RISCVGenExegesis.inc": ["-gen-exegesis"], - "lib/Target/RISCV/RISCVGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/RISCV/RISCVGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/RISCV/RISCVGenAsmWriter.inc", + ), + ( + ["-gen-compress-inst-emitter"], + "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/RISCV/RISCVGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + ], + "lib/Target/RISCV/RISCVGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/RISCV/RISCVGenInstrInfo.inc", + ), + ( + ["-gen-macro-fusion-pred"], + "lib/Target/RISCV/RISCVGenMacroFusion.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/RISCV/RISCVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/RISCV/RISCVGenRegisterInfo.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoEnums.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoMCDesc.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoHeader.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/RISCV/RISCVGenSubtargetInfo.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/RISCV/RISCVGenSearchableTables.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/RISCV/RISCVGenExegesis.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/RISCV/RISCVGenSDNodeInfo.inc", + ), + ], "tbl_deps": [ ":riscv_isel_target_gen", ], @@ -2473,135 +3017,404 @@ llvm_target_lib_list = [lib for lib in [ { "name": "Sparc", "short_name": "Sparc", - "tbl_outs": { - "lib/Target/Sparc/SparcGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Sparc/SparcGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Sparc/SparcGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Sparc/SparcGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Sparc/SparcGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Sparc/SparcGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Sparc/SparcGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Sparc/SparcGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/Sparc/SparcGenDisassemblerTables.inc": ["-gen-disassembler"], - 
"lib/Target/Sparc/SparcGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/Sparc/SparcGenSDNodeInfo.inc": [ - "-gen-sd-node-info", - "-sdnode-namespace=SPISD", - ], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/Sparc/SparcGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/Sparc/SparcGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Sparc/SparcGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Sparc/SparcGenRegisterInfo.inc", + "lib/Target/Sparc/SparcGenRegisterInfoEnums.inc", + "lib/Target/Sparc/SparcGenRegisterInfoMCDesc.inc", + "lib/Target/Sparc/SparcGenRegisterInfoHeader.inc", + "lib/Target/Sparc/SparcGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/Sparc/SparcGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Sparc/SparcGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Sparc/SparcGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Sparc/SparcGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Sparc/SparcGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/Sparc/SparcGenSearchableTables.inc", + ), + ( + [ + "-gen-sd-node-info", + "-sdnode-namespace=SPISD", + ], + "lib/Target/Sparc/SparcGenSDNodeInfo.inc", + ), + ], }, { "name": "SPIRV", "short_name": "SPIRV", - "tbl_outs": { - "lib/Target/SPIRV/SPIRVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SPIRV/SPIRVGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=SPIRVPreLegalizerCombiner", - ], - "lib/Target/SPIRV/SPIRVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SPIRV/SPIRVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/SPIRV/SPIRVGenTables.inc": ["-gen-searchable-tables"], - "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/SPIRV/SPIRVGenAsmWriter.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/SPIRV/SPIRVGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=SPIRVPreLegalizerCombiner", + ], + "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SPIRV/SPIRVGenInstrInfo.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/SPIRV/SPIRVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoEnums.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoMCDesc.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoHeader.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/SPIRV/SPIRVGenTables.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc", + ), + ], }, { "name": "SystemZ", "short_name": "SystemZ", - "tbl_outs": { - "lib/Target/SystemZ/SystemZGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/SystemZ/SystemZGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/SystemZ/SystemZGenDAGISel.inc": ["-gen-dag-isel"], 
- "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SystemZ/SystemZGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SystemZ/SystemZGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/SystemZ/SystemZGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/SystemZ/SystemZGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/SystemZ/SystemZGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SystemZ/SystemZGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SystemZ/SystemZGenRegisterInfo.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoEnums.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoMCDesc.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoHeader.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/SystemZ/SystemZGenSDNodeInfo.inc", + ), + ], }, { "name": "VE", "short_name": "VE", - "tbl_outs": { - "lib/Target/VE/VEGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/VE/VEGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/VE/VEGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/VE/VEGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/VE/VEGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/VE/VEGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/VE/VEGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/VE/VEGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/VE/VEGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/VE/VEGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/VE/VEGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/VE/VEGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/VE/VEGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/VE/VEGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/VE/VEGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/VE/VEGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/VE/VEGenRegisterInfo.inc", + "lib/Target/VE/VEGenRegisterInfoEnums.inc", + "lib/Target/VE/VEGenRegisterInfoMCDesc.inc", + "lib/Target/VE/VEGenRegisterInfoHeader.inc", + "lib/Target/VE/VEGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/VE/VEGenSubtargetInfo.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/VE/VEGenSDNodeInfo.inc", + ), + ], }, { "name": "WebAssembly", "short_name": "WebAssembly", - "tbl_outs": { - "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc": ["-gen-dag-isel"], - 
"lib/Target/WebAssembly/WebAssemblyGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-disassembler"], + "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/WebAssembly/WebAssemblyGenFastISel.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoEnums.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoMCDesc.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoHeader.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc", + ), + ], }, { "name": "X86", "short_name": "X86", - "tbl_outs": { - "lib/Target/X86/X86GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/X86/X86GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/X86/X86GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/X86/X86GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/X86/X86GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/X86/X86GenAsmWriter1.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/X86/X86GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/X86/X86GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/X86/X86GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/X86/X86GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/X86/X86GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/X86/X86GenFoldTables.inc": [ - "-gen-x86-fold-tables", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenInstrMapping.inc": ["-gen-x86-instr-mapping"], - "lib/Target/X86/X86GenExegesis.inc": ["-gen-exegesis"], - "lib/Target/X86/X86GenMnemonicTables.inc": [ - "-gen-x86-mnemonic-tables", - "-asmwriternum=1", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/X86/X86GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/X86/X86GenRegisterInfo.inc", + "lib/Target/X86/X86GenRegisterInfoEnums.inc", + "lib/Target/X86/X86GenRegisterInfoMCDesc.inc", + "lib/Target/X86/X86GenRegisterInfoHeader.inc", + "lib/Target/X86/X86GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-disassembler"], + "lib/Target/X86/X86GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/X86/X86GenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/X86/X86GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenAsmWriter1.inc", + 
), + ( + ["-gen-asm-matcher"], + "lib/Target/X86/X86GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/X86/X86GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/X86/X86GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/X86/X86GenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/X86/X86GenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/X86/X86GenSubtargetInfo.inc", + ), + ( + [ + "-gen-x86-fold-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenFoldTables.inc", + ), + ( + ["-gen-x86-instr-mapping"], + "lib/Target/X86/X86GenInstrMapping.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/X86/X86GenExegesis.inc", + ), + ( + [ + "-gen-x86-mnemonic-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenMnemonicTables.inc", + ), + ], }, { "name": "XCore", "short_name": "XCore", - "tbl_outs": { - "lib/Target/XCore/XCoreGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/XCore/XCoreGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/XCore/XCoreGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/XCore/XCoreGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/XCore/XCoreGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/XCore/XCoreGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/XCore/XCoreGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/XCore/XCoreGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/XCore/XCoreGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/XCore/XCoreGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/XCore/XCoreGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/XCore/XCoreGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/XCore/XCoreGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/XCore/XCoreGenRegisterInfo.inc", + "lib/Target/XCore/XCoreGenRegisterInfoEnums.inc", + "lib/Target/XCore/XCoreGenRegisterInfoMCDesc.inc", + "lib/Target/XCore/XCoreGenRegisterInfoHeader.inc", + "lib/Target/XCore/XCoreGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/XCore/XCoreGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/XCore/XCoreGenSubtargetInfo.inc", + ), + ], }, ] if lib["name"] in llvm_targets] @@ -2639,16 +3452,46 @@ gentbl_cc_library( gentbl_cc_library( name = "r600_target_gen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = { - "lib/Target/AMDGPU/R600GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/R600GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/R600GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/R600GenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/AMDGPU/R600GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/R600GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/R600GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + tbl_outs = [ + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/R600GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/R600GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/R600GenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/AMDGPU/R600GenDFAPacketizer.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/R600GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + 
"lib/Target/AMDGPU/R600GenRegisterInfo.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/R600GenSubtargetInfo.inc", + ), + ], tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/R600.td", deps = [ @@ -3381,7 +4224,10 @@ cc_library( gentbl_cc_library( name = "LibOptionsTableGen", strip_include_prefix = "lib/ToolDrivers/llvm-lib", - tbl_outs = {"lib/ToolDrivers/llvm-lib/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = [( + ["-gen-opt-parser-defs"], + "lib/ToolDrivers/llvm-lib/Options.inc", + )], tblgen = ":llvm-tblgen", td_file = "lib/ToolDrivers/llvm-lib/Options.td", deps = [":OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 153c7eeedd0ab..1421ec553f251 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10318,6 +10318,8 @@ cc_library( ), includes = ["include"], deps = [ + ":FunctionInterfaces", + ":IR", ":OpenACCDialect", ":OpenACCOpsIncGen", ":OpenACCPassIncGen", @@ -10325,6 +10327,7 @@ cc_library( ":Support", ":ViewLikeInterface", "//llvm:Support", + "//llvm:ir_headers", ], ) @@ -13105,6 +13108,7 @@ cc_library( ":RuntimeVerifiableOpInterface", ":ShapedOpInterfaces", ":SideEffectInterfaces", + ":UBDialect", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support", diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl index 35888aac37e17..d28a8854fa896 100644 --- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl +++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl @@ -153,7 +153,7 @@ def _gentbl_rule_impl(ctx): args.add("-o", ctx.outputs.out) ctx.actions.run( - outputs = [ctx.outputs.out], + outputs = [ctx.outputs.out] + ctx.outputs.additional_outputs, inputs = trans_srcs, executable = ctx.executable.tblgen, execution_requirements = {"supports-path-mapping": "1"}, @@ -195,6 +195,9 @@ gentbl_rule = rule( doc = "The output file for the TableGen invocation.", mandatory = True, ), + "additional_outputs": attr.output_list( + doc = "Extra output files from the TableGen invocation. The primary 'out' is used for the -o argument.", + ), "opts": attr.string_list( doc = "Additional command line options to add to the TableGen" + " invocation. For include arguments, prefer to use" + @@ -313,9 +316,12 @@ def gentbl_filegroup( name: The name of the generated filegroup rule for use in dependencies. tblgen: The binary used to produce the output. td_file: The primary table definitions file. - tbl_outs: Either a dict {out: [opts]} or a list of tuples ([opts], out), - where each 'opts' is a list of options passed to tblgen, each option - being a string, and 'out' is the corresponding output file produced. + tbl_outs: Either a dict {out: [opts]}, a list of tuples ([opts], out), + or a list of tuples ([opts], [outs]). Each 'opts' is a list of options + passed to tblgen, each option being a string, + and 'out' is the corresponding output file produced. If 'outs' are used, + the first path in the list is passed to '-o' but tblgen is expected + to produce all listed outputs. 
td_srcs: See gentbl_rule.td_srcs includes: See gentbl_rule.includes deps: See gentbl_rule.deps @@ -325,9 +331,14 @@ def gentbl_filegroup( **kwargs: Extra keyword arguments to pass to all generated rules. """ + included_srcs = [] if type(tbl_outs) == type({}): tbl_outs = [(v, k) for k, v in tbl_outs.items()] - for (opts, out) in tbl_outs: + for (opts, output_or_outputs) in tbl_outs: + outs = output_or_outputs if type(output_or_outputs) == type([]) else [output_or_outputs] + out = outs[0] + if not any([skip_opt in opts for skip_opt in skip_opts]): + included_srcs.extend(outs) first_opt = opts[0] if opts else "" rule_suffix = "_{}_{}".format( first_opt.replace("-", "_").replace("=", "_"), @@ -343,6 +354,7 @@ def gentbl_filegroup( deps = deps, includes = includes, out = out, + additional_outputs = outs[1:], **kwargs ) @@ -364,7 +376,6 @@ def gentbl_filegroup( **kwargs ) - included_srcs = [f for (opts, f) in tbl_outs if not any([skip_opt in opts for skip_opt in skip_opts])] native.filegroup( name = name, srcs = included_srcs,
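
For context on the tbl_outs migration above: under the updated contract, an entry can map one set of tblgen options to several declared outputs, only the first of which is named on the tblgen command line. A minimal sketch of the three accepted forms, using a hypothetical Example target (the rule name and all file paths below are illustrative, not part of this change):

    # Sketch only: "example_tables_gen" and the Example/* paths are hypothetical.
    gentbl_cc_library(
        name = "example_tables_gen",
        tbl_outs = [
            # Form 1: ([opts], out) - a single output, passed to tblgen via -o.
            (
                ["-gen-instr-info"],
                "lib/Target/Example/ExampleGenInstrInfo.inc",
            ),
            # Form 2: ([opts], [outs]) - outs[0] goes on -o; the remaining paths
            # are declared via additional_outputs, and tblgen is expected to
            # emit them as side outputs of the same invocation.
            (
                ["-gen-register-info"],
                [
                    "lib/Target/Example/ExampleGenRegisterInfo.inc",
                    "lib/Target/Example/ExampleGenRegisterInfoEnums.inc",
                ],
            ),
        ],
        tblgen = ":llvm-tblgen",
        td_file = "lib/Target/Example/Example.td",
    )

    # Form 3: the legacy dict {out: [opts]} is still accepted and is normalized
    # to the tuple form at the top of gentbl_filegroup:
    #     tbl_outs = {"lib/Target/Example/ExampleGenInstrInfo.inc": ["-gen-instr-info"]},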
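
And a rough expansion of how the loop in gentbl_filegroup splits a multi-output entry when it instantiates gentbl_rule: the first path becomes the mandatory out (and hence the -o argument assembled in _gentbl_rule_impl), while the tail is declared through the new additional_outputs attribute so Bazel still tracks every generated file. The name and paths below are again hypothetical, and the generated rule name is abbreviated (the real suffix is derived from the first option plus a second component not shown in this hunk):

    # Approximate expansion of Form 2 above (sketch only; deps/td_srcs elided).
    gentbl_rule(
        name = "example_tables_gen_gen_register_info_genrule",
        out = "lib/Target/Example/ExampleGenRegisterInfo.inc",  # outs[0] -> -o
        additional_outputs = [
            # outs[1:] -> declared outputs, not named on the command line
            "lib/Target/Example/ExampleGenRegisterInfoEnums.inc",
        ],
        opts = ["-gen-register-info"],
        tblgen = ":llvm-tblgen",
        td_file = "lib/Target/Example/Example.td",
    )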